diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 795e1fbba9216..c100b46c38a59 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,6 +23,12 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [CI] " + - package-ecosystem: "maven" + directory: "/java/" + schedule: + interval: "weekly" + commit-message: + prefix: "MINOR: [Java] " - package-ecosystem: "npm" directory: "/js/" schedule: diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index bc11bb42366e2..1f915240e9f31 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -57,7 +57,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.8' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index f27d95c4e8cd7..fc939693c369c 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,12 +41,12 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Install Archery and Crossbow dependencies run: pip install -e arrow/dev/archery[bot] - - name: Handle Github comment event + - name: Handle GitHub comment event env: ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }} @@ -182,7 +182,7 @@ jobs: if: github.event.comment.body == 'take' runs-on: ubuntu-latest steps: - - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index e6ae6c60b0f4c..2e3c2a355a884 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -193,19 +193,13 @@ jobs: submodules: recursive - name: Install Dependencies run: | - rm -f /usr/local/bin/2to3* || : - rm -f /usr/local/bin/idle3* || : - rm -f /usr/local/bin/pydoc3* || : - rm -f /usr/local/bin/python3* || : - rm -f /usr/local/bin/python3-config || : - brew update --preinstall || : brew bundle --file=cpp/Brewfile - name: Install MinIO run: | $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest /usr/local - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.9 - name: Install Google Cloud Storage Testbench @@ -432,7 +426,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.9 - name: Install Google Cloud Storage Testbench diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 3d1e513bc609c..17ef2de81088f 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -49,7 +49,7 @@ jobs: dotnet: ['7.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v3 + uses: actions/setup-dotnet@v4 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -77,7 +77,7 @@ jobs: dotnet: ['7.0.x'] steps: - name: Install C# - uses: 
actions/setup-dotnet@v3 + uses: actions/setup-dotnet@v4 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow @@ -104,7 +104,7 @@ jobs: dotnet: ['7.0.x'] steps: - name: Install C# - uses: actions/setup-dotnet@v3 + uses: actions/setup-dotnet@v4 with: dotnet-version: ${{ matrix.dotnet }} - name: Checkout Arrow diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index df2b20a9e3c77..4892767324335 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -41,7 +41,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -88,7 +88,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: '3.8' - name: Install Ruby @@ -96,7 +96,7 @@ jobs: with: ruby-version: '2.7' - name: Install .NET - uses: actions/setup-dotnet@3447fd6a9f9e57506b15f895c5b76d3b197dc7c2 # v3.2.0 + uses: actions/setup-dotnet@4d6c8fcf3c8f7a60068d26b594648e99df24cee3 # v4.0.0 with: dotnet-version: '7.0.x' - name: Install Dependencies diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 78b01b561f3cb..10b33c96d2129 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -53,7 +53,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -64,7 +64,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -75,7 +75,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: debug: true github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/dev_pr/issue_check.js b/.github/workflows/dev_pr/issue_check.js index 75e86b40923c2..fb5d986dff2f7 100644 --- a/.github/workflows/dev_pr/issue_check.js +++ b/.github/workflows/dev_pr/issue_check.js @@ -103,7 +103,7 @@ async function commentNotStartedTicket(github, context, pullRequestNumber) { } /** - * Assigns the Github Issue to the PR creator. + * Assigns the GitHub Issue to the PR creator. 
* * @param {Object} github * @param {Object} context diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b30e1eb8809db..098b5ff29df5a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -51,7 +51,7 @@ jobs: key: ubuntu-docs-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-docs- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index e96ccecdff598..8d10060c9d8a0 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -57,7 +57,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 11668aaf1b301..ded6985f9ed5c 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -167,7 +167,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -207,7 +207,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -299,7 +299,7 @@ jobs: github.event_name == 'push' && github.repository == 'apache/arrow' && github.ref_name == 'main' - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: '3.10' - name: Run Benchmarks diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index bd99b62a2fe02..4960b4dbd61e8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -81,7 +81,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml index 86d1858c8c596..ec614ca1e7c56 100644 --- a/.github/workflows/issue_bot.yml +++ b/.github/workflows/issue_bot.yml @@ -33,7 +33,7 @@ jobs: if: github.event.issue.pull_request == null runs-on: ubuntu-latest steps: - - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | let split_body = context.payload.issue.body.split('### Component(s)'); @@ -64,7 +64,7 @@ jobs: "per_page": 100, }); - // this removes non-existent labels + // this removes nonexistent labels component_labels = component_labels.filter( label => repo_labels.data.some(repo_label => repo_label.name === label) ); diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 69adc184b7fe7..ee4c1b21c37d4 100644 --- 
a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -48,7 +48,6 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: - ubuntu: name: AMD64 Ubuntu 22.04 Java JDK ${{ matrix.jdk }} Maven ${{ matrix.maven }} runs-on: ubuntu-latest @@ -76,7 +75,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -109,7 +108,7 @@ jobs: jdk: [11] steps: - name: Set up Java - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: 'zulu' java-version: ${{ matrix.jdk }} @@ -136,7 +135,7 @@ jobs: jdk: [11] steps: - name: Set up Java - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: java-version: ${{ matrix.jdk }} distribution: 'temurin' diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 76b10b828ee49..9f05a357a11d3 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -48,7 +48,6 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: - docker: name: AMD64 manylinux2014 Java JNI runs-on: ubuntu-latest @@ -70,7 +69,7 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -110,7 +109,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index 11aa4e59beefd..c19576d2f659e 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: cache: 'pip' python-version: 3.8 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index b2040a76dec48..e2c76a3d1cb24 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -51,7 +51,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 596d3511a543d..31ab32800705c 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -40,7 +40,7 @@ jobs: - name: 'Download PR review payload' id: 'download' if: github.event_name == 'workflow_run' - uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | const run_id = "${{ github.event.workflow_run.id }}"; @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # 
v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index d201f90101de8..d9979da0ee12a 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -100,7 +100,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -161,21 +161,13 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install Dependencies shell: bash run: | - rm -f /usr/local/bin/2to3* || : - rm -f /usr/local/bin/idle3* || : - rm -f /usr/local/bin/pydoc3* || : - rm -f /usr/local/bin/python3* || : - rm -f /usr/local/bin/python3-config || : - brew update --preinstall || : - brew install --overwrite git brew bundle --file=cpp/Brewfile - brew install coreutils python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index db10e6f28ce1c..4fc308a28d4d6 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -60,7 +60,7 @@ jobs: strategy: fail-fast: false matrix: - r: ["4.2"] + r: ["4.3"] ubuntu: [20.04] force-tests: ["true"] env: @@ -83,7 +83,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -144,7 +144,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -215,6 +215,9 @@ jobs: r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}- - uses: r-lib/actions/setup-r@v2 with: + # Note: RTools must be 40 here because RTools40 + ucrt is how we build the Arrow C++ + # static library. The R version is not used here, but R 4.1 was the last R version to use + # Rtools40.
r-version: "4.1" rtools-version: 40 Ncpus: 2 @@ -234,7 +237,7 @@ jobs: windows-r: needs: [windows-cpp] - name: AMD64 Windows R ${{ matrix.config.rversion }} RTools ${{ matrix.config.rtools }} + name: AMD64 Windows R ${{ matrix.config.rversion }} runs-on: windows-2019 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 @@ -242,8 +245,8 @@ jobs: fail-fast: false matrix: config: - - { rtools: 42, rversion: "4.2" } - - { rtools: 42, rversion: "devel" } + - { rversion: "release" } + env: ARROW_R_CXXFLAGS: "-Werror" _R_CHECK_TESTS_NLINES_: 0 @@ -255,7 +258,6 @@ jobs: fetch-depth: 0 - run: mkdir r/windows - name: Download artifacts - if: ${{ matrix.config.rtools == 42 }} uses: actions/download-artifact@v3 with: name: libarrow-rtools40-ucrt64.zip @@ -269,7 +271,6 @@ jobs: - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.rversion }} - rtools-version: ${{ matrix.config.rtools }} Ncpus: 2 - uses: r-lib/actions/setup-r-dependencies@v2 env: @@ -318,7 +319,7 @@ jobs: timeout = 3600 ) - name: Run lintr - if: ${{ matrix.config.rversion == '4.2' }} + if: ${{ matrix.config.rversion == 'release' }} env: NOT_CRAN: "true" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 5a34239721392..27a32d22f90c0 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: cache: 'pip' python-version: 3.8 diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index b9a4ac03b6108..300d49742b713 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -82,7 +82,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 - name: Setup Archery @@ -149,13 +149,6 @@ jobs: - name: Install Homebrew Dependencies shell: bash run: | - rm -f /usr/local/bin/2to3* || : - rm -f /usr/local/bin/idle3* || : - rm -f /usr/local/bin/pydoc3* || : - rm -f /usr/local/bin/python3* || : - rm -f /usr/local/bin/python3-config || : - brew update --preinstall || : - brew install --overwrite git brew bundle --file=cpp/Brewfile brew bundle --file=c_glib/Brewfile - name: Install Ruby Dependencies diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ecdf628355ea..6101f5d3cac25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1459,7 +1459,7 @@ * [ARROW-12172](https://issues.apache.org/jira/browse/ARROW-12172) - [Python][Packaging] Pass python version as setuptools pretend version in the macOS wheel builds * [ARROW-12178](https://issues.apache.org/jira/browse/ARROW-12178) - [CI] Update setuptools in the ubuntu images * [ARROW-12186](https://issues.apache.org/jira/browse/ARROW-12186) - [Rust][DataFusion] Fix regexp_match test -* [ARROW-12209](https://issues.apache.org/jira/browse/ARROW-12209) - [JS] Copy all src files into the the TypeScript package +* [ARROW-12209](https://issues.apache.org/jira/browse/ARROW-12209) - [JS] Copy all src files into the TypeScript package * [ARROW-12220](https://issues.apache.org/jira/browse/ARROW-12220) - [C++][CI] Thread sanitizer failure * 
[ARROW-12226](https://issues.apache.org/jira/browse/ARROW-12226) - [C++] Fix Address Sanitizer failures * [ARROW-12227](https://issues.apache.org/jira/browse/ARROW-12227) - [R] Fix RE2 and median nightly build failures @@ -11430,7 +11430,7 @@ * [ARROW-67](https://issues.apache.org/jira/browse/ARROW-67) - C++ metadata flatbuffer serialization and data movement to memory maps * [ARROW-68](https://issues.apache.org/jira/browse/ARROW-68) - Better error handling for not fully setup systems * [ARROW-70](https://issues.apache.org/jira/browse/ARROW-70) - Add adapt 'lite' DCHECK macros from Kudu as also used in Parquet -* [ARROW-71](https://issues.apache.org/jira/browse/ARROW-71) - [C++] Add clang-tidy and clang-format to the the tool chain. +* [ARROW-71](https://issues.apache.org/jira/browse/ARROW-71) - [C++] Add clang-tidy and clang-format to the tool chain. * [ARROW-73](https://issues.apache.org/jira/browse/ARROW-73) - Support older CMake versions * [ARROW-76](https://issues.apache.org/jira/browse/ARROW-76) - Revise format document to include null count, defer non-nullable arrays to the domain of metadata * [ARROW-78](https://issues.apache.org/jira/browse/ARROW-78) - C++: Add constructor for DecimalType diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 95029d98f7a01..2cdd1d42634bf 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=14.0.0.9000 +pkgver=14.0.1.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/go_build.sh b/ci/scripts/go_build.sh index 94f75e501ea0b..ea77ecf56ac0e 100755 --- a/ci/scripts/go_build.sh +++ b/ci/scripts/go_build.sh @@ -21,6 +21,9 @@ set -ex source_dir=${1}/go +# Need "all=" as per https://github.com/golang/go/issues/42131#issuecomment-713917379 +export GOFLAGS="${GOFLAGS} -gcflags=all=-d=checkptr" + pushd ${source_dir}/arrow if [[ -n "${ARROW_GO_TESTCGO}" ]]; then diff --git a/ci/scripts/go_cgo_python_test.sh b/ci/scripts/go_cgo_python_test.sh index 5f2032fba92f4..ef98e414bbf1e 100755 --- a/ci/scripts/go_cgo_python_test.sh +++ b/ci/scripts/go_cgo_python_test.sh @@ -21,6 +21,8 @@ set -ex source_dir=${1}/go +export GOFLAGS="${GOFLAGS} -gcflags=all=-d=checkptr" + pushd ${source_dir}/arrow/cdata/test case "$(uname)" in diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index b5a38f01412d4..a5a012ad2c5c4 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -43,6 +43,12 @@ fi # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 +# Due to how Go reads environment variables, we have to set them from the calling +# process, or they would get ignored. 
+# (see https://forum.golangbridge.org/t/are-godebug-and-other-env-vars-ignored-when-loading-a-go-dll-from-foreign-code/33694) +export GOMEMLIMIT=200MiB +export GODEBUG=gctrace=1,clobberfree=1 + # Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1 time archery integration \ --run-c-data \ diff --git a/ci/scripts/integration_dask.sh b/ci/scripts/integration_dask.sh index f91d21b921708..bf306dc652239 100755 --- a/ci/scripts/integration_dask.sh +++ b/ci/scripts/integration_dask.sh @@ -32,7 +32,9 @@ python -c "import dask.dataframe" # pytest -sv --pyargs dask.bytes.tests.test_local # The "skip_with_pyarrow_strings" marker is meant to skip automatically, but that doesn't work with --pyargs, so de-selecting manually -pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" +# - The 'test_categorize_info' test is failing because of a change in StringArray's nbytes and +# an upstream fix (https://github.com/apache/arrow/issues/39028) +pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" -k "not test_categorize_info" pytest -v --pyargs dask.dataframe.io.tests.test_orc pytest -v --pyargs dask.dataframe.io.tests.test_parquet \ -m "not skip_with_pyarrow_strings and not xfail_with_pyarrow_strings" diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 66ea8d677d187..77dd1ccdafa09 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -77,20 +77,20 @@ mvn="${mvn} -T 2C" pushd ${source_dir} -${mvn} install - if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then - ${mvn} -Pshade-flatbuffers install + mvn="${mvn} -Pshade-flatbuffers" fi if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then - ${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data install + mvn="${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data" fi if [ "${ARROW_JAVA_JNI}" = "ON" ]; then - ${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni install + mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni" fi +${mvn} install + if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference diff --git a/ci/scripts/rust_build.sh b/ci/scripts/rust_build.sh index 2dfc0f1b1892d..5fc21d454b080 100755 --- a/ci/scripts/rust_build.sh +++ b/ci/scripts/rust_build.sh @@ -21,6 +21,7 @@ set -e arrow_dir=${1} source_dir=${1}/rust +build_dir=${2}/rust # This file is used to build the rust binaries needed for the archery # integration tests. Testing of the rust implementation in normal CI is handled @@ -54,7 +55,7 @@ rustup show pushd ${source_dir} # build only the integration testing binaries -cargo build -p arrow-integration-testing +cargo build -p arrow-integration-testing --target-dir ${build_dir} # Save disk space by removing large temporary build products rm -rf target/debug/deps diff --git a/cmake-format.py b/cmake-format.py index 3e77733f4d1ee..b8fc8939692da 100644 --- a/cmake-format.py +++ b/cmake-format.py @@ -72,5 +72,5 @@ first_comment_is_literal = True # If comment markup is enabled, don't reflow any comment block which - # matchesthis (regex) pattern. Default is `None` (disabled). + # matches this (regex) pattern. Default is `None` (disabled).
literal_comment_pattern = None diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bcb298407bd8b..9f17350b2505a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -873,9 +873,6 @@ add_dependencies(arrow_test_dependencies toolchain-tests) if(ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) if(ARROW_HDFS OR ARROW_ORC) - if(Protobuf_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}) - endif() if(NOT MSVC_TOOLCHAIN) list(APPEND ARROW_STATIC_LINK_LIBS ${CMAKE_DL_LIBS}) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index f6324c1c0a96d..a15b204c39757 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -430,6 +430,21 @@ ], "displayName": "Benchmarking build with with everything enabled", "cacheVariables": {} + }, + { + "name": "fuzzing", + "inherits": "base", + "displayName": "Debug build with IPC and Parquet fuzzing targets", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_C_COMPILER": "clang", + "CMAKE_CXX_COMPILER": "clang++", + "ARROW_USE_ASAN": "ON", + "ARROW_USE_UBSAN": "ON", + "ARROW_IPC": "ON", + "ARROW_PARQUET": "ON", + "ARROW_FUZZING": "ON" + } } ] } diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 52632d554aafb..978f0319837da 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2388,10 +2388,10 @@ if(ARROW_USE_XSIMD) resolve_dependency(xsimd FORCE_ANY_NEWER_VERSION TRUE + IS_RUNTIME_DEPENDENCY + FALSE REQUIRED_VERSION - "8.1.0" - PC_PACKAGE_NAMES - xsimd) + "8.1.0") if(xsimd_SOURCE STREQUAL "BUNDLED") add_library(arrow::xsimd INTERFACE IMPORTED) @@ -5063,6 +5063,9 @@ if(ARROW_S3) string(APPEND ARROW_PC_REQUIRES_PRIVATE " libcurl") endif() string(APPEND ARROW_PC_REQUIRES_PRIVATE " openssl") + if(APPLE) + string(APPEND ARROW_PC_LIBS_PRIVATE " -framework Security") + endif() endif() endif() diff --git a/cpp/examples/parquet/low_level_api/reader_writer2.cc b/cpp/examples/parquet/low_level_api/reader_writer2.cc index 65dd5799e95af..0a8831710b4c0 100644 --- a/cpp/examples/parquet/low_level_api/reader_writer2.cc +++ b/cpp/examples/parquet/low_level_api/reader_writer2.cc @@ -98,7 +98,7 @@ int main(int argc, char** argv) { static_cast(rg_writer->column(col_id)); bool bool_value = ((i % 2) == 0) ? true : false; bool_writer->WriteBatch(1, nullptr, nullptr, &bool_value); - buffered_values_estimate[col_id] = bool_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = bool_writer->estimated_buffered_value_bytes(); // Write the Int32 column col_id++; @@ -106,7 +106,7 @@ int main(int argc, char** argv) { static_cast(rg_writer->column(col_id)); int32_t int32_value = i; int32_writer->WriteBatch(1, nullptr, nullptr, &int32_value); - buffered_values_estimate[col_id] = int32_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = int32_writer->estimated_buffered_value_bytes(); // Write the Int64 column. Each row has repeats twice. 
col_id++; @@ -119,7 +119,7 @@ int main(int argc, char** argv) { int64_t int64_value2 = (2 * i + 1); repetition_level = 1; // start of a new record int64_writer->WriteBatch(1, &definition_level, &repetition_level, &int64_value2); - buffered_values_estimate[col_id] = int64_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = int64_writer->estimated_buffered_value_bytes(); // Write the INT96 column. col_id++; @@ -130,7 +130,7 @@ int main(int argc, char** argv) { int96_value.value[1] = i + 1; int96_value.value[2] = i + 2; int96_writer->WriteBatch(1, nullptr, nullptr, &int96_value); - buffered_values_estimate[col_id] = int96_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = int96_writer->estimated_buffered_value_bytes(); // Write the Float column col_id++; @@ -138,7 +138,7 @@ int main(int argc, char** argv) { static_cast(rg_writer->column(col_id)); float float_value = static_cast(i) * 1.1f; float_writer->WriteBatch(1, nullptr, nullptr, &float_value); - buffered_values_estimate[col_id] = float_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = float_writer->estimated_buffered_value_bytes(); // Write the Double column col_id++; @@ -146,7 +146,7 @@ int main(int argc, char** argv) { static_cast(rg_writer->column(col_id)); double double_value = i * 1.1111111; double_writer->WriteBatch(1, nullptr, nullptr, &double_value); - buffered_values_estimate[col_id] = double_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = double_writer->estimated_buffered_value_bytes(); // Write the ByteArray column. Make every alternate values NULL col_id++; @@ -166,7 +166,7 @@ int main(int argc, char** argv) { int16_t definition_level = 0; ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr); } - buffered_values_estimate[col_id] = ba_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = ba_writer->estimated_buffered_value_bytes(); // Write the FixedLengthByteArray column col_id++; @@ -178,7 +178,7 @@ int main(int argc, char** argv) { flba_value.ptr = reinterpret_cast(&flba[0]); flba_writer->WriteBatch(1, nullptr, nullptr, &flba_value); - buffered_values_estimate[col_id] = flba_writer->EstimatedBufferedValueBytes(); + buffered_values_estimate[col_id] = flba_writer->estimated_buffered_value_bytes(); } // Close the RowGroupWriter diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 24e8eefad1523..46a7aa910633d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -223,12 +223,14 @@ set(ARROW_SRCS util/debug.cc util/decimal.cc util/delimiting.cc + util/dict_util.cc util/float16.cc util/formatting.cc util/future.cc util/hashing.cc util/int_util.cc util/io_util.cc + util/list_util.cc util/logging.cc util/key_value_metadata.cc util/memory.cc @@ -597,7 +599,7 @@ if(ARROW_BUILD_BUNDLED_DEPENDENCIES) IMPORTED_LOCATION) install(FILES ${arrow_bundled_dependencies_path} ${INSTALL_IS_OPTIONAL} DESTINATION ${CMAKE_INSTALL_LIBDIR}) - string(APPEND ARROW_PC_LIBS_PRIVATE " -larrow_bundled_dependencies") + string(PREPEND ARROW_PC_LIBS_PRIVATE " -larrow_bundled_dependencies") list(INSERT ARROW_STATIC_INSTALL_INTERFACE_LIBS 0 "Arrow::arrow_bundled_dependencies") endif() @@ -790,6 +792,7 @@ add_arrow_test(array_test array/array_binary_test.cc array/array_dict_test.cc array/array_list_test.cc + array/array_list_view_test.cc array/array_run_end_test.cc array/array_struct_test.cc array/array_union_test.cc diff --git a/cpp/src/arrow/acero/accumulation_queue.h 
b/cpp/src/arrow/acero/accumulation_queue.h index 285790207f93c..a27b8b399ce47 100644 --- a/cpp/src/arrow/acero/accumulation_queue.h +++ b/cpp/src/arrow/acero/accumulation_queue.h @@ -82,7 +82,7 @@ class SequencingQueue { /// This method will be called on each batch in order. Calls to this method /// will be serialized and it will not be called reentrantly. This makes it /// safe to do things that rely on order but minimal time should be spent here - /// to avoid becoming a bottlneck. + /// to avoid becoming a bottleneck. /// /// \return a follow-up task that will be scheduled. The follow-up task(s) are /// is not guaranteed to run in any particular order. If nullopt is diff --git a/cpp/src/arrow/acero/aggregate_internal.h b/cpp/src/arrow/acero/aggregate_internal.h index 72537a7f7e3fe..5730d99f93f88 100644 --- a/cpp/src/arrow/acero/aggregate_internal.h +++ b/cpp/src/arrow/acero/aggregate_internal.h @@ -224,7 +224,7 @@ class ScalarAggregateNode : public ExecNode, public TracedNode { // Field indices corresponding to the segment-keys const std::vector segment_field_ids_; // Holds the value of segment keys of the most recent input batch - // The values are updated everytime an input batch is processed + // The values are updated every time an input batch is processed std::vector segmenter_values_; const std::vector> target_fieldsets_; diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 4a3b6b199c4c0..2609905a0b552 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -472,7 +472,7 @@ class BackpressureController : public BackpressureControl { }; class InputState { - // InputState correponds to an input + // InputState corresponds to an input // Input record batches are queued up in InputState until processed and // turned into output record batches. 
@@ -1453,7 +1453,7 @@ class AsofJoinNode : public ExecNode { bool must_hash_; bool may_rehash_; // InputStates - // Each input state correponds to an input table + // Each input state corresponds to an input table std::vector> state_; std::mutex gate_; TolType tolerance_; diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index df3172b2a09bc..e400cc031693a 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -604,7 +604,7 @@ struct BasicTest { auto r0_types = init_types(all_types, [](T& t) { return t->byte_width() > 1; }); auto r1_types = init_types(all_types, [](T& t) { return t->byte_width() > 1; }); - // sample a limited number of type-combinations to keep the runnning time reasonable + // sample a limited number of type-combinations to keep the running time reasonable // the scoped-traces below help reproduce a test failure, should it happen auto start_time = std::chrono::system_clock::now(); auto seed = start_time.time_since_epoch().count(); @@ -1279,7 +1279,7 @@ TRACED_TEST(AsofJoinTest, TestUnsupportedOntype, { field("r0_v0", float32())})); }) -TRACED_TEST(AsofJoinTest, TestUnsupportedBytype, { +TRACED_TEST(AsofJoinTest, TestUnsupportedByType, { DoRunInvalidTypeTest(schema({field("time", int64()), field("key", list(int32())), field("l_v0", float64())}), schema({field("time", int64()), field("key", list(int32())), diff --git a/cpp/src/arrow/acero/expression_benchmark.cc b/cpp/src/arrow/acero/expression_benchmark.cc index 9799446ed6524..a57dd6b9e3f22 100644 --- a/cpp/src/arrow/acero/expression_benchmark.cc +++ b/cpp/src/arrow/acero/expression_benchmark.cc @@ -107,7 +107,7 @@ static void ExecuteScalarExpressionOverhead(benchmark::State& state, Expression } /// \brief Baseline benchmarks are implemented in pure C++ without arrow for performance -/// comparision. +/// comparison. template void ExecuteScalarExpressionBaseline(benchmark::State& state) { const auto rows_per_batch = static_cast(state.range(0)); @@ -193,7 +193,7 @@ BENCHMARK_CAPTURE(BindAndEvaluate, nested_scalar, struct ComplexExpressionBaseline { public: ComplexExpressionBaseline(size_t input_size) { - /* hack - cuts off a few elemets if the input size is not a multiple of 64 for + /* hack - cuts off a few elements if the input size is not a multiple of 64 for * simplicity. We can't use std::vector here since it slows down things * massively */ less_20.resize(input_size / 64); diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 02e67927cc03f..a4874f3581040 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -261,7 +261,7 @@ Result MakeGroupByOutput(const std::vector& output_batches, return struct_arr; } - // The exec plan may reorder the output rows. The tests are all setup to expect ouptut + // The exec plan may reorder the output rows. The tests are all setup to expect output // in ascending order of keys. So we need to sort the result by the key columns. To do // that we create a table using the key columns, calculate the sort indices from that // table (sorting on all fields) and then use those indices to calculate our result. 
diff --git a/cpp/src/arrow/acero/partition_util.h b/cpp/src/arrow/acero/partition_util.h index 27cde61d58797..1413a8326ade0 100644 --- a/cpp/src/arrow/acero/partition_util.h +++ b/cpp/src/arrow/acero/partition_util.h @@ -33,11 +33,11 @@ class PartitionSort { public: /// \brief Bucket sort rows on partition ids in O(num_rows) time. /// - /// Include in the output exclusive cummulative sum of bucket sizes. + /// Include in the output exclusive cumulative sum of bucket sizes. /// This corresponds to ranges in the sorted array containing all row ids for /// each of the partitions. /// - /// prtn_ranges must be initailized and have at least prtn_ranges + 1 elements + /// prtn_ranges must be initialized and have at least prtn_ranges + 1 elements /// when this method returns prtn_ranges[i] will contains the total number of /// elements in partitions 0 through i. prtn_ranges[0] will be 0. /// diff --git a/cpp/src/arrow/acero/pivot_longer_node.cc b/cpp/src/arrow/acero/pivot_longer_node.cc index e54f00a20be3f..ea5ca44baa10b 100644 --- a/cpp/src/arrow/acero/pivot_longer_node.cc +++ b/cpp/src/arrow/acero/pivot_longer_node.cc @@ -135,7 +135,7 @@ class PivotLongerNode : public ExecNode, public TracedNode { for (std::size_t i = 0; i < measurement_types.size(); i++) { if (!measurement_types[i]) { return Status::Invalid( - "All row templates had nullopt for the meausrement column at index ", i, " (", + "All row templates had nullopt for the measurement column at index ", i, " (", options.measurement_field_names[i], ")"); } fields.push_back( diff --git a/cpp/src/arrow/acero/sorted_merge_node.cc b/cpp/src/arrow/acero/sorted_merge_node.cc index f3b934eda186b..4d4565a6bb5e7 100644 --- a/cpp/src/arrow/acero/sorted_merge_node.cc +++ b/cpp/src/arrow/acero/sorted_merge_node.cc @@ -95,7 +95,7 @@ class BackpressureController : public BackpressureControl { std::atomic& backpressure_counter_; }; -/// InputState correponds to an input. Input record batches are queued up in InputState +/// InputState corresponds to an input. Input record batches are queued up in InputState /// until processed and turned into output record batches. class InputState { public: diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 3f11b89af39de..2f79ed299bb70 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -1433,16 +1433,16 @@ void SwissTableForJoinBuild::PrtnMerge(int prtn_id) { if (!no_payload_) { // Count sort payloads on key id // - // Start by computing inclusive cummulative sum of counters. + // Start by computing inclusive cumulative sum of counters. // uint32_t sum = 0; for (int64_t i = 0; i < num_keys; ++i) { sum += counters[i]; counters[i] = sum; } - // Now use cummulative sum of counters to obtain the target position in + // Now use cumulative sum of counters to obtain the target position in // the sorted order for each row. At the end of this process the counters - // will contain exclusive cummulative sum (instead of inclusive that is + // will contain exclusive cumulative sum (instead of inclusive that is // there at the beginning). // source_payload_ids.resize(prtn_state.key_ids.size()); @@ -1458,7 +1458,7 @@ void SwissTableForJoinBuild::PrtnMerge(int prtn_id) { } } else { // When there is no payload to process, we just need to compute exclusive - // cummulative sum of counters and add the base payload id to all of them. + // cumulative sum of counters and add the base payload id to all of them. 
// uint32_t sum = 0; for (int64_t i = 0; i < num_keys; ++i) { diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 88b80f06f57f2..6403b7a655e96 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -156,7 +156,7 @@ class RowArrayMerge { // All input sources must be initialized, but they can contain zero rows. // // Output in vector the first target row id for each source (exclusive - // cummulative sum of number of rows in sources). This output is optional, + // cumulative sum of number of rows in sources). This output is optional, // caller can pass in nullptr to indicate that it is not needed. // static Status PrepareForMerge(RowArray* target, const std::vector& sources, @@ -235,7 +235,7 @@ class SwissTableMerge { // All input sources must be initialized, but they can be empty. // // Output in a vector the first target group id for each source (exclusive - // cummulative sum of number of groups in sources). This output is optional, + // cumulative sum of number of groups in sources). This output is optional, // caller can pass in nullptr to indicate that it is not needed. // static Status PrepareForMerge(SwissTable* target, diff --git a/cpp/src/arrow/acero/union_node.cc b/cpp/src/arrow/acero/union_node.cc index 054fcdaba24be..dc3ee102d4b07 100644 --- a/cpp/src/arrow/acero/union_node.cc +++ b/cpp/src/arrow/acero/union_node.cc @@ -80,6 +80,9 @@ class UnionNode : public ExecNode, public TracedNode { NoteInputReceived(batch); ARROW_DCHECK(std::find(inputs_.begin(), inputs_.end(), input) != inputs_.end()); + if (inputs_.size() > 1) { + batch.index = compute::kUnsequencedIndex; + } return output_->InputReceived(this, std::move(batch)); } diff --git a/cpp/src/arrow/acero/union_node_test.cc b/cpp/src/arrow/acero/union_node_test.cc index 8c07ece939a9e..d925ac378eab4 100644 --- a/cpp/src/arrow/acero/union_node_test.cc +++ b/cpp/src/arrow/acero/union_node_test.cc @@ -63,8 +63,9 @@ struct TestUnionNode : public ::testing::Test { out_batches->batches.push_back(empty_record_batch); } else { for (size_t j = 0; j < num_batches; j++) { - out_batches->batches.push_back( - ExecBatch(*rng_.BatchOf(schema->fields(), batch_size))); + auto out_batch = ExecBatch(*rng_.BatchOf(schema->fields(), batch_size)); + out_batch.index = j; + out_batches->batches.push_back(std::move(out_batch)); } } @@ -108,6 +109,13 @@ struct TestUnionNode : public ::testing::Test { auto expected_matcher = Finishes(ResultWith(UnorderedElementsAreArray(exp_batches.batches))); ASSERT_THAT(actual, expected_matcher); + + // union node with multiple inputs should produce unordered batches + if (batches.size() > 1) { + for (const auto& batch : *actual.result()) { + ASSERT_EQ(batch.index, compute::kUnsequencedIndex); + } + } } void CheckUnionExecNode(size_t num_input_nodes, size_t num_batches, bool parallel) { diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index eab71de27b11a..b483ec420cc3c 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -95,7 +95,7 @@ struct ScalarFromArraySlotImpl { Status Visit(const MonthDayNanoIntervalArray& a) { return Finish(a.Value(index_)); } template - Status Visit(const BaseListArray& a) { + Status Visit(const VarLengthListLikeArray& a) { return Finish(a.value_slice(index_)); } diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 28fccdbfcffee..7fd76a1dae81b 100644 --- 
a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -212,7 +212,7 @@ Result> TransposeDictIndices( return out_data; } -struct CompactTransposeMapVistor { +struct CompactTransposeMapVisitor { const std::shared_ptr& data; arrow::MemoryPool* pool; std::unique_ptr output_map; @@ -306,11 +306,11 @@ Result> CompactTransposeMap( } const auto& dict_type = checked_cast(*data->type); - CompactTransposeMapVistor vistor{data, pool, nullptr, nullptr}; - RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); + CompactTransposeMapVisitor visitor{data, pool, nullptr, nullptr}; + RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &visitor)); - out_compact_dictionary = vistor.out_compact_dictionary; - return std::move(vistor.output_map); + out_compact_dictionary = visitor.out_compact_dictionary; + return std::move(visitor.output_map); } } // namespace diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index a3a2f99851b55..0b591d401804d 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -41,10 +41,11 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using ListTypes = ::testing::Types; +using ListAndListViewTypes = + ::testing::Types; // ---------------------------------------------------------------------- -// List tests +// List and ListView tests template class TestListArray : public ::testing::Test { @@ -57,7 +58,9 @@ class TestListArray : public ::testing::Test { using OffsetArrayType = typename TypeTraits::OffsetArrayType; using OffsetBuilderType = typename TypeTraits::OffsetBuilderType; - void SetUp() { + static constexpr bool kTypeClassIsListView = is_list_view_type::value; + + void SetUp() override { value_type_ = int16(); type_ = std::make_shared(value_type_); @@ -72,8 +75,10 @@ class TestListArray : public ::testing::Test { result_ = std::dynamic_pointer_cast(out); } - void ValidateBasicListArray(const ArrayType* result, const std::vector& values, - const std::vector& is_valid) { + private: + void DoValidateBasicListArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { ASSERT_OK(result->ValidateFull()); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -108,6 +113,58 @@ class TestListArray : public ::testing::Test { result_->raw_value_offsets()[result->length()]); } + void DoValidateBasicListViewArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + std::vector ex_offsets = {0, 3, 3}; + std::vector ex_sizes = {3, 0}; + for (size_t i = 0; i < ex_sizes.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); + ASSERT_EQ(ex_sizes[i], result->value_length(i)); + } + ASSERT_EQ(ex_offsets[ex_sizes.size()], result->value_offset(ex_sizes.size())); + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + auto varr = std::dynamic_pointer_cast(result->values()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } + + auto offsets = std::dynamic_pointer_cast(result->offsets()); + auto sizes = std::dynamic_pointer_cast(result->sizes()); + ASSERT_EQ(offsets->length(), result->length()); + ASSERT_EQ(offsets->null_count(), 0); + 
AssertTypeEqual(*offsets->type(), OffsetType()); + ASSERT_EQ(sizes->length(), result->length()); + ASSERT_EQ(sizes->null_count(), 0); + AssertTypeEqual(*sizes->type(), OffsetType()); + + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(offsets->Value(i), result_->raw_value_offsets()[i]); + ASSERT_EQ(sizes->Value(i), result_->raw_value_sizes()[i]); + } + } + + void ValidateBasicListArray(const ArrayType* result, const std::vector& values, + const std::vector& is_valid) { + if constexpr (kTypeClassIsListView) { + return DoValidateBasicListViewArray(result, values, is_valid); + } else { + return DoValidateBasicListArray(result, values, is_valid); + } + } + + public: void TestBasics() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector lengths = {3, 0, 4}; @@ -120,7 +177,7 @@ class TestListArray : public ::testing::Test { int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_valid[i] > 0)); + ASSERT_OK(builder_->Append(is_valid[i] > 0, lengths[i])); for (int j = 0; j < lengths[i]; ++j) { ASSERT_OK(vb->Append(values[pos++])); } @@ -133,25 +190,29 @@ class TestListArray : public ::testing::Test { void TestEquality() { auto vb = checked_cast(builder_->value_builder()); - std::shared_ptr array, equal_array, unequal_array; + std::shared_ptr array, equal_array; std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + std::vector equal_sizes = {1, 1, 3, 1, 1, 1, 2, 0}; std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + + std::shared_ptr unequal_array; std::vector unequal_offsets = {0, 1, 4, 7}; + std::vector unequal_sizes = {1, 3, 3, 0}; std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; - // setup two equal arrays - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); - ASSERT_OK(builder_->Finish(&array)); - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); - ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); + ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); ASSERT_OK(builder_->Finish(&equal_array)); - // now an unequal one - ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size())); - ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); + ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_sizes.data(), + unequal_offsets.size())); + ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality @@ -197,16 +258,37 @@ class TestListArray : public ::testing::Test { EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); } - void TestFromArraysWithNullBitMap() { - std::shared_ptr offsets_w_nulls, offsets_wo_nulls, values; + private: + Result> FromArrays(const Array& offsets, const Array& sizes, + const Array& values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + if constexpr (kTypeClassIsListView) { + return ArrayType::FromArrays(offsets, sizes, values, pool_, null_bitmap, + null_count); + } else { + return ArrayType::FromArrays(offsets, values, pool_, null_bitmap, null_count); + } + } + + void TestFromArraysWithNullBitmap() { + std::shared_ptr offsets_w_nulls, offsets_wo_nulls; + 
std::shared_ptr sizes_w_nulls, sizes_wo_nulls; + std::shared_ptr values; std::vector offsets = {0, 1, 1, 3, 4}; + std::vector sizes = {1, 0, 2, 1}; std::vector offsets_w_nulls_is_valid = {true, false, true, true, true}; + std::vector sizes_w_nulls_is_valid = {true, false, true, true}; ArrayFromVector(offsets_w_nulls_is_valid, offsets, &offsets_w_nulls); ArrayFromVector(offsets, &offsets_wo_nulls); + ArrayFromVector(sizes_w_nulls_is_valid, sizes, + &sizes_w_nulls); + ArrayFromVector(sizes, &sizes_wo_nulls); + auto type = std::make_shared(int32()); auto expected = std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], null, [0, null], [0]]")); @@ -214,29 +296,41 @@ class TestListArray : public ::testing::Test { // Offsets with nulls will match. ASSERT_OK_AND_ASSIGN(auto result, - ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list - ASSERT_OK_AND_ASSIGN(result, - ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap - ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap - ASSERT_RAISES(Invalid, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_RAISES(Invalid, FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); + + if constexpr (kTypeClassIsListView) { + // Sizes with nulls will match. 
+ ASSERT_OK_AND_ASSIGN(auto result, + FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Cannot specify both null sizes with null_bitmap + ASSERT_RAISES(Invalid, FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values, + expected->null_bitmap())); + } } - void TestFromArraysWithSlicedOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedOffsets() { std::vector offsets = {-1, -1, 0, 1, 2, 4}; std::shared_ptr offsets_wo_nulls; @@ -261,7 +355,8 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArraysWithSlicedNullOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedNullOffsets() { std::vector offsets = {-1, -1, 0, 1, 1, 3}; std::vector offsets_w_nulls_is_valid = {true, true, true, false, true, true}; @@ -288,7 +383,17 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArrays() { + public: + void TestFromArraysNullHandling() { + this->TestFromArraysWithNullBitmap(); + if constexpr (!kTypeClassIsListView) { + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); + } + } + + private: + void DoTestListFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; @@ -373,6 +478,87 @@ class TestListArray : public ::testing::Test { } } + template + std::enable_if_t DoTestListViewFromArrays() { + std::shared_ptr offsets1, offsets2; + std::shared_ptr sizes1, sizes2, sizes3, sizes4, sizes5; + std::shared_ptr values; + + std::vector sizes_is_valid3 = {true, false, true, true}; + std::vector sizes_is_valid4 = {true, true, false, true}; + std::vector sizes_is_valid5 = {true, true, false, false}; + + std::vector values_is_valid = {true, false, true, true, true, true}; + + std::vector offset1_values = {2, 0, 2}; + std::vector offset2_values = {2, 0, 6}; + std::vector size1_values = {0, 2, 4}; + std::vector size2_values = {4, 2, 0}; + + std::vector values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; + + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); + + ArrayFromVector(size1_values, &sizes1); + ArrayFromVector(size2_values, &sizes2); + ArrayFromVector(sizes_is_valid3, size1_values, &sizes3); + ArrayFromVector(sizes_is_valid4, size2_values, &sizes4); + ArrayFromVector(sizes_is_valid5, size2_values, &sizes5); + + ArrayFromVector(values_is_valid, values_values, &values); + + auto list_type = std::make_shared(int8()); + + ASSERT_OK_AND_ASSIGN(auto list_view1, + ArrayType::FromArrays(*offsets1, *sizes1, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view3, + ArrayType::FromArrays(*offsets1, *sizes3, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view4, + ArrayType::FromArrays(*offsets2, *sizes4, *values, pool_)); + ASSERT_OK(list_view1->ValidateFull()); + ASSERT_OK(list_view3->ValidateFull()); + ASSERT_OK(list_view4->ValidateFull()); + + ArrayType expected1(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, offsets1->data()->buffers[0], + 0); + AssertArraysEqual(expected1, *list_view1); + + // Use null bitmap from sizes3, but clean sizes from non-null version + ArrayType expected3(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, sizes3->data()->buffers[0], + 1); + AssertArraysEqual(expected3, *list_view3); + + 
ArrayType expected4(list_type, length, offsets2->data()->buffers[1], + sizes2->data()->buffers[1], values, sizes4->data()->buffers[0], + 1); + AssertArraysEqual(expected4, *list_view4); + + // Test failure modes + + std::shared_ptr tmp; + + // Zero-length offsets (not a failure mode for ListViews) + ASSERT_OK(ArrayType::FromArrays(*offsets1->Slice(0, 0), *sizes1->Slice(0, 0), *values, + pool_)); + + // Offsets not the right type + ASSERT_RAISES(TypeError, + ArrayType::FromArrays(/*offsets=*/*values, *sizes1, *values, pool_)); + } + + public: + void TestFromArrays() { + if constexpr (kTypeClassIsListView) { + DoTestListViewFromArrays(); + } else { + DoTestListFromArrays(); + } + } + void TestAppendNull() { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -420,11 +606,13 @@ class TestListArray : public ::testing::Test { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector is_valid = {1, 0, 1}; std::vector offsets = {0, 3, 3}; + std::vector sizes = {3, 0, 1}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -434,16 +622,17 @@ class TestListArray : public ::testing::Test { void TestBulkAppendInvalid() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; std::vector is_valid = {1, 0, 1}; - // Should be {0, 3, 3} given the is_valid array std::vector offsets = {0, 2, 4}; + std::vector sizes = {2, 2, 4}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -466,7 +655,12 @@ class TestListArray : public ::testing::Test { builder_.reset(checked_cast(tmp.release())); std::vector offsets = {1, 2, 4, 8}; - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + std::vector sizes = {1, 2, 4}; + if constexpr (kTypeClassIsListView) { + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), sizes.size())); + } else { + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + } std::shared_ptr list_array; ASSERT_OK(builder_->Finish(&list_array)); @@ -485,10 +679,16 @@ class TestListArray : public ::testing::Test { void TestFlattenSimple() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( - ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]")); + ArrayFromJSON(type, "[[], null, [1, 2], [3], [4], null, [5], [], [6]]")); ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); ASSERT_OK(flattened->ValidateFull()); EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); + + list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [1, 2], [3], [4], [], [5], [], [6]]")); + ASSERT_OK_AND_ASSIGN(flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); } void 
TestFlattenNulls() { @@ -500,6 +700,35 @@ class TestListArray : public ::testing::Test { AssertTypeEqual(*flattened->type(), *value_type_); } + void TestFlattenAllEmpty() { + auto type = std::make_shared(int32()); + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [], [], [], [], []]")); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))); + + if constexpr (kTypeClassIsListView) { + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")); + auto array_data = list_array->data(); + + auto offsets = array_data->buffers[1]->template mutable_data_as(); + auto sizes = array_data->buffers[2]->template mutable_data_as(); + + // Set all sizes to 0, except the one for the null entry + memset(sizes, 0, sizeof(offset_type) * array_data->length); + sizes[2] = 4; + // Make the offset of the null entry be non-zero and out of order + offsets[2] = 1; + + ASSERT_OK(list_array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))) + << flattened->ToString(); + } + } + void TestFlattenSliced() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( @@ -520,7 +749,7 @@ class TestListArray : public ::testing::Test { std::dynamic_pointer_cast( ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")) ->data(); - ASSERT_EQ(2, array_data->buffers.size()); + ASSERT_EQ(kTypeClassIsListView ? 3 : 2, array_data->buffers.size()); auto null_bitmap_buffer = array_data->buffers[0]; ASSERT_NE(nullptr, null_bitmap_buffer); bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 1); @@ -534,20 +763,47 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } - Status ValidateOffsets(int64_t length, std::vector offsets, - const std::shared_ptr& values, int64_t offset = 0) { + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, + std::vector sizes, + std::shared_ptr values, int64_t offset = 0) { auto type = std::make_shared(values->type()); - ArrayType arr(type, length, Buffer::Wrap(offsets), values, + auto offsets_buffer = Buffer::Wrap(offsets.data(), sizes.size()); + auto sizes_buffer = Buffer::Wrap(sizes); + ArrayType arr(type, length, std::move(offsets_buffer), std::move(sizes_buffer), + std::move(values), /*null_bitmap=*/nullptr, /*null_count=*/0, offset); return arr.ValidateFull(); } - void TestValidateOffsets() { + Status ValidateOffsets(int64_t length, std::vector offsets, + std::shared_ptr values, int64_t offset = 0) { + if constexpr (kTypeClassIsListView) { + std::vector sizes; + // Always reserve some space so Buffer::Wrap doesn't create a null buffer + // when length of the sizes buffer is 0. + sizes.reserve( + std::max(static_cast(1), offsets.empty() ? 
0 : offsets.size() - 1)); + for (size_t i = 1; i < offsets.size(); ++i) { + sizes.push_back(offsets[i] - offsets[i - 1]); + } + return ValidateOffsetsAndSizes(length, std::move(offsets), std::move(sizes), + std::move(values), offset); + } else { + auto type = std::make_shared(values->type()); + ArrayType arr(type, length, Buffer::Wrap(offsets), std::move(values), + /*null_bitmap=*/nullptr, /*null_count=*/0, offset); + return arr.ValidateFull(); + } + } + + void TestValidateDimensions() { auto empty_values = ArrayFromJSON(int16(), "[]"); auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, 5, 6, 7]"); - // An empty list array can have omitted or 0-length offsets - ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + if constexpr (!kTypeClassIsListView) { + // An empty list array can have omitted or 0-length offsets + ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + } ASSERT_OK(ValidateOffsets(0, {0}, empty_values)); ASSERT_OK(ValidateOffsets(1, {0, 7}, values)); @@ -564,13 +820,24 @@ class TestListArray : public ::testing::Test { // Offset out of bounds ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + if constexpr (kTypeClassIsListView) { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 2)); + } else { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + } // Negative offset ASSERT_RAISES(Invalid, ValidateOffsets(1, {-1, 0}, values)); ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, -1, -1}, values, 1)); // Offsets non-monotonic ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 7, 4}, values)); + + if constexpr (kTypeClassIsListView) { + // Out of order offsets + ASSERT_OK(ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 6, 5}, values)); + + // Sizes out of bounds + ASSERT_RAISES(Invalid, ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 7, 5}, values)); + } } void TestCornerCases() { @@ -581,7 +848,7 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result_, *expected); SetUp(); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 0)); Done(); expected = ArrayFromJSON(type_, "[[]]"); AssertArraysEqual(*result_, *expected); @@ -602,7 +869,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); @@ -612,7 +879,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(vb->Append(3)); @@ -629,7 +896,7 @@ class TestListArray : public ::testing::Test { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestListArray, ListTypes); +TYPED_TEST_SUITE(TestListArray, ListAndListViewTypes); TYPED_TEST(TestListArray, Basics) { this->TestBasics(); } @@ -639,11 +906,7 @@ TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); } TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } -TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { - this->TestFromArraysWithNullBitMap(); - 
this->TestFromArraysWithSlicedOffsets(); - this->TestFromArraysWithSlicedNullOffsets(); -} +TYPED_TEST(TestListArray, FromArraysNullHandling) { this->TestFromArraysNullHandling(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } @@ -661,12 +924,13 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); } +TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } -TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } +TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } @@ -676,6 +940,82 @@ TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck( TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } #endif +class TestListConversions : public ::testing::Test { + private: + MemoryPool* pool_; + + public: + TestListConversions() : pool_(default_memory_pool()) {} + + template + void DoTestListViewFromList() { + using DestListViewArrayClass = typename TypeTraits::ArrayType; + using SrcListArrayClass = typename TypeTraits::ArrayType; + auto list_type = std::make_shared(int32()); + auto list_view_type = std::make_shared(int32()); + + auto expected_list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_view_wo_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + std::shared_ptr list_w_nulls = + ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_wo_nulls, *result, /*verbose=*/true); + } + + template + void DoTestListFromListView() { + using SrcListViewArrayClass = typename TypeTraits::ArrayType; + using DestListArrayClass = typename TypeTraits::ArrayType; + auto list_view_type = std::make_shared(int32()); + auto list_type = std::make_shared(int32()); + + auto list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto list_view_wo_nulls = ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + auto expected_list_w_nulls = ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); + } +}; + +TEST_F(TestListConversions, 
ListViewFromList) { + this->DoTestListViewFromList(); + this->DoTestListViewFromList(); +} + +TEST_F(TestListConversions, ListFromListView) { + this->DoTestListFromListView(); + this->DoTestListFromListView(); +} + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/array/array_list_view_test.cc b/cpp/src/arrow/array/array_list_view_test.cc new file mode 100644 index 0000000000000..3e48191cedded --- /dev/null +++ b/cpp/src/arrow/array/array_list_view_test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/util.h" +#include "arrow/pretty_print.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// List-view array tests + +namespace { + +class TestListViewArray : public ::testing::Test { + public: + std::shared_ptr string_values; + std::shared_ptr int32_values; + std::shared_ptr int16_values; + + void SetUp() override { + string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + } + + static std::shared_ptr Offsets(std::string_view json) { + return ArrayFromJSON(int32(), json); + } + + static std::shared_ptr Sizes(std::string_view json) { + return ArrayFromJSON(int32(), json); + } +}; + +} // namespace + +TEST_F(TestListViewArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto list_view_array, + ListViewArray::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + auto array_data = list_view_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *list_view_array); + // Should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); +} + +TEST_F(TestListViewArray, FromOffsetsAndSizes) { + std::shared_ptr list_view_array; + + ASSERT_OK_AND_ASSIGN(list_view_array, ListViewArray::FromArrays( + *Offsets("[0, 0, 1, 1000]"), + *Sizes("[2, 1, 1, null]"), *int32_values)); + ASSERT_EQ(list_view_array->length(), 4); + ASSERT_ARRAYS_EQUAL(*list_view_array->values(), *int32_values); + ASSERT_EQ(list_view_array->offset(), 0); + ASSERT_EQ(list_view_array->data()->GetNullCount(), 1); + ASSERT_EQ(list_view_array->data()->buffers.size(), 3); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index d8308c824953a..03f3e5af29908 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ 
b/cpp/src/arrow/array/array_nested.cc @@ -27,6 +27,8 @@ #include "arrow/array/array_base.h" #include "arrow/array/array_primitive.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -38,6 +40,7 @@ #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -48,7 +51,7 @@ using internal::checked_pointer_cast; using internal::CopyBitmap; // ---------------------------------------------------------------------- -// ListArray / LargeListArray (common utilities) +// ListArray / LargeListArray / ListViewArray / LargeListViewArray (common utilities) namespace { @@ -137,6 +140,77 @@ Result::ArrayType>> ListArrayFromArray return std::make_shared(std::move(data)); } +template +Result::ArrayType>> ListViewArrayFromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + using offset_type = typename TYPE::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + if (offsets.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); + } + + if (sizes.length() != offsets.length() && sizes.length() != offsets.length() - 1) { + return Status::Invalid( + "List sizes must have the same length as offsets or one less than offsets"); + } + if (sizes.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List sizes must be ", OffsetArrowType::type_name()); + } + + if (offsets.offset() != sizes.offset()) { + return Status::Invalid("List offsets and sizes must have the same offset"); + } + const int64_t array_offset = sizes.offset(); + + if (null_bitmap) { + if (offsets.null_count() > 0 || sizes.null_count() > 0) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets or sizes with nulls"); + } + if (array_offset != 0) { + return Status::Invalid( + "List offsets and sizes must not be slices if a validity map is specified"); + } + } else { + if (offsets.null_count() > 0 && sizes.null_count() > 0) { + return Status::Invalid("Ambiguous to specify both offsets and sizes with nulls"); + } + } + + DCHECK(offsets.length() == sizes.length() || offsets.length() - 1 == sizes.length()); + + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(offsets); + const auto& typed_sizes = checked_cast(sizes); + + auto derived_validity_buffer = std::move(null_bitmap); + if (offsets.null_count() > 0) { + derived_validity_buffer = offsets.null_bitmap(); + null_count = offsets.null_count(); + // We allow construction from an offsets array containing one extra value. + // If that is the case, we might need to discount one null from out_null_count. 
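+    // (For example, offsets = [0, 2, 3, null] with sizes = [2, 1, 1] describes three
+    //  list-views; the null sits in the extra offsets slot, beyond the logical length,
+    //  so it must not be counted as a null list-view.)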
+ if (offsets.length() - 1 == sizes.length() && !offsets.IsValid(sizes.length())) { + null_count -= 1; + } + } else if (sizes.null_count() > 0) { + derived_validity_buffer = sizes.null_bitmap(); + null_count = sizes.null_count(); + } + + auto buffers = BufferVector({ + std::move(derived_validity_buffer), + typed_offsets.values(), + typed_sizes.values(), + }); + auto data = ArrayData::Make(type, sizes.length(), std::move(buffers), {values.data()}, + null_count, array_offset); + return std::make_shared(std::move(data)); +} + static std::shared_ptr SliceArrayWithOffsets(const Array& array, int64_t begin, int64_t end) { return array.Slice(begin, end - begin); @@ -189,23 +263,199 @@ Result> FlattenListArray(const ListArrayT& list_array, return Concatenate(non_null_fragments, memory_pool); } +template +Result> FlattenListViewArray(const ListViewArrayT& list_view_array, + MemoryPool* memory_pool) { + using offset_type = typename ListViewArrayT::offset_type; + const int64_t list_view_array_offset = list_view_array.offset(); + const int64_t list_view_array_length = list_view_array.length(); + std::shared_ptr value_array = list_view_array.values(); + + if (list_view_array_length == 0) { + return SliceArrayWithOffsets(*value_array, 0, 0); + } + + // If the list array is *all* nulls, then just return an empty array. + if constexpr (HasNulls) { + if (list_view_array.null_count() == list_view_array.length()) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + } + + const auto* validity = list_view_array.data()->template GetValues(0, 0); + const auto* offsets = list_view_array.data()->template GetValues(1); + const auto* sizes = list_view_array.data()->template GetValues(2); + + auto is_null_or_empty = [&](int64_t i) { + if constexpr (HasNulls) { + if (!bit_util::GetBit(validity, list_view_array_offset + i)) { + return true; + } + } + return sizes[i] == 0; + }; + + // Index of the first valid, non-empty list-view. + int64_t first_i = 0; + for (; first_i < list_view_array_length; first_i++) { + if (!is_null_or_empty(first_i)) { + break; + } + } + // If all list-views are empty, return an empty array. + if (first_i == list_view_array_length) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + + std::vector> slices; + { + int64_t i = first_i; + auto begin_offset = offsets[i]; + auto end_offset = offsets[i] + sizes[i]; + i += 1; + // Inductive invariant: slices and the always non-empty values slice + // [begin_offset, end_offset) contains all the maximally contiguous slices of the + // values array that are covered by all the list-views before list-view i. + for (; i < list_view_array_length; i++) { + if (is_null_or_empty(i)) { + // The invariant is preserved by simply preserving the current set of slices. + } else { + if (offsets[i] == end_offset) { + end_offset += sizes[i]; + // The invariant is preserved because since the non-empty list-view i + // starts at end_offset, the current range can be extended to end at + // offsets[i] + sizes[i] (the same as end_offset + sizes[i]). + } else { + // The current slice can't be extended because the list-view i either + // shares values with the current slice or starts after the position + // immediately after the end of the current slice. + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + begin_offset = offsets[i]; + end_offset = offsets[i] + sizes[i]; + // The invariant is preserved because a maximally contiguous slice of + // the values array (i.e. 
one that can't be extended) was added to slices + // and [begin_offset, end_offset) is non-empty and contains the + // current list-view i. + } + } + } + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + } + + // Final attempt to avoid invoking Concatenate(). + switch (slices.size()) { + case 0: + return MakeEmptyArray(value_array->type(), memory_pool); + case 1: + return slices[0]; + } + + return Concatenate(slices, memory_pool); +} + std::shared_ptr BoxOffsets(const std::shared_ptr& boxed_type, const ArrayData& data) { + const int64_t num_offsets = + is_list_view(data.type->id()) ? data.length : data.length + 1; std::vector> buffers = {nullptr, data.buffers[1]}; auto offsets_data = - std::make_shared(boxed_type, data.length + 1, std::move(buffers), + std::make_shared(boxed_type, /*length=*/num_offsets, std::move(buffers), /*null_count=*/0, data.offset); return MakeArray(offsets_data); } +std::shared_ptr BoxSizes(const std::shared_ptr& boxed_type, + const ArrayData& data) { + DCHECK(is_list_view(data.type->id())); + std::vector> buffers = {nullptr, data.buffers[2]}; + auto sizes_data = + std::make_shared(boxed_type, data.length, std::move(buffers), + /*null_count=*/0, data.offset); + return MakeArray(sizes_data); +} + +template +Result> ListViewFromListImpl( + const std::shared_ptr& list_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename SrcListType::offset_type; + const auto& list_type = checked_cast(*list_data->type); + + // To re-use the validity and offsets buffers, a sizes buffer with enough + // padding on the beginning is allocated and filled with the sizes after + // list_data->offset. + const int64_t buffer_length = list_data->offset + list_data->length; + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + AllocateBuffer(buffer_length * sizeof(offset_type), pool)); + const auto* offsets = list_data->template GetValues(1, 0); + auto* sizes = sizes_buffer->mutable_data_as(); + // Zero the initial padding area to avoid leaking any data when buffers are + // sent over IPC or throught the C Data interface. 
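+  // (Illustrative: for a list array with offset 2, length 3 and an offsets buffer of
+  //  [0, 0, 0, 3, 3, 7], the sizes buffer built below is [0, 0, 3, 0, 4].)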
+ memset(sizes, 0, list_data->offset * sizeof(offset_type)); + for (int64_t i = list_data->offset; i < buffer_length; i++) { + sizes[i] = offsets[i + 1] - offsets[i]; + } + BufferVector buffers = {list_data->buffers[0], list_data->buffers[1], + std::move(sizes_buffer)}; + + return ArrayData::Make(std::make_shared(list_type.value_type()), + list_data->length, std::move(buffers), + {list_data->child_data[0]}, list_data->null_count, + list_data->offset); +} + +template +Result> ListFromListViewImpl( + const std::shared_ptr& list_view_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename DestListType::offset_type; + using ListBuilderType = typename TypeTraits::BuilderType; + + const auto& list_view_type = + checked_cast(*list_view_data->type); + const auto& value_type = list_view_type.value_type(); + const auto list_type = std::make_shared(value_type); + + ARROW_ASSIGN_OR_RAISE(auto sum_of_list_view_sizes, + list_util::internal::SumOfLogicalListSizes(*list_view_data)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value_builder, + MakeBuilder(value_type, pool)); + RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes)); + auto list_builder = std::make_shared(pool, value_builder, list_type); + RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); + + ArraySpan values{*list_view_data->child_data[0]}; + const auto* in_validity_bitmap = list_view_data->GetValues(0); + const auto* in_offsets = list_view_data->GetValues(1); + const auto* in_sizes = list_view_data->GetValues(2); + for (int64_t i = 0; i < list_view_data->length; ++i) { + const bool is_valid = + !in_validity_bitmap || + bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i); + const int64_t size = is_valid ? in_sizes[i] : 0; + RETURN_NOT_OK(list_builder->Append(is_valid, size)); + RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], size)); + } + std::shared_ptr list_array_data; + RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data)); + return list_array_data; +} + } // namespace namespace internal { template -inline void SetListData(BaseListArray* self, const std::shared_ptr& data, +inline void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id) { - ARROW_CHECK_EQ(data->buffers.size(), 2); + ARROW_CHECK_EQ(data->buffers.size(), is_list_view(TYPE::type_id) ? 3 : 2); ARROW_CHECK_EQ(data->type->id(), expected_type_id); ARROW_CHECK_EQ(data->child_data.size(), 1); @@ -214,6 +464,7 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrlist_type_ = checked_cast(data->type.get()); self->raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); + // BaseListViewArray::SetData takes care of setting raw_value_sizes_. 
ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); @@ -225,7 +476,9 @@ inline void SetListData(BaseListArray* self, const std::shared_ptr data) { SetData(std::move(data)); } +ListArray::ListArray(std::shared_ptr data) { + ListArray::SetData(std::move(data)); +} ListArray::ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -250,6 +503,13 @@ Result> ListArray::FromArrays( values, pool, null_bitmap, null_count); } +Result> ListArray::FromListView(const ListViewArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> ListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -273,7 +533,9 @@ std::shared_ptr ListArray::offsets() const { return BoxOffsets(int32(), * // ---------------------------------------------------------------------- // LargeListArray -LargeListArray::LargeListArray(const std::shared_ptr& data) { SetData(data); } +LargeListArray::LargeListArray(const std::shared_ptr& data) { + LargeListArray::SetData(data); +} LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -284,7 +546,7 @@ LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t le auto internal_data = ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); internal_data->child_data.emplace_back(values->data()); - SetData(internal_data); + LargeListArray::SetData(internal_data); } void LargeListArray::SetData(const std::shared_ptr& data) { @@ -299,6 +561,14 @@ Result> LargeListArray::FromArrays( null_count); } +Result> LargeListArray::FromListView( + const LargeListViewArray& source, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> LargeListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -321,6 +591,144 @@ std::shared_ptr LargeListArray::offsets() const { return BoxOffsets(int64(), *data_); } +// ---------------------------------------------------------------------- +// ListViewArray + +ListViewArray::ListViewArray(std::shared_ptr data) { + ListViewArray::SetData(std::move(data)); +} + +ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + ListViewArray::SetData(ArrayData::Make( + std::move(type), length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void ListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> ListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> ListViewArray::FromArrays( + 
std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LIST_VIEW) { + return Status::TypeError("Expected list-view type, got ", type->ToString()); + } + const auto& list_view_type = checked_cast(*type); + if (!list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching list-view value type"); + } + return ListViewArrayFromArrays(std::move(type), offsets, sizes, values, + pool, null_bitmap, null_count); +} + +Result> ListViewArray::FromList(const ListArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> LargeListViewArray::FromList( + const LargeListArray& source, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> ListViewArray::Flatten(MemoryPool* memory_pool) const { + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); +} + +std::shared_ptr ListViewArray::offsets() const { + return BoxOffsets(int32(), *data_); +} + +std::shared_ptr ListViewArray::sizes() const { return BoxSizes(int32(), *data_); } + +// ---------------------------------------------------------------------- +// LargeListViewArray + +LargeListViewArray::LargeListViewArray(std::shared_ptr data) { + LargeListViewArray::SetData(std::move(data)); +} + +LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, + int64_t null_count, int64_t offset) { + LargeListViewArray::SetData(ArrayData::Make( + type, length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void LargeListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> LargeListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> LargeListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LARGE_LIST_VIEW) { + return Status::TypeError("Expected large list-view type, got ", type->ToString()); + } + const auto& large_list_view_type = checked_cast(*type); + if (!large_list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching large list-view value type"); + } + return ListViewArrayFromArrays( + std::move(type), offsets, sizes, values, pool, null_bitmap, null_count); +} + +Result> LargeListViewArray::Flatten( + MemoryPool* memory_pool) const { + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); +} + +std::shared_ptr LargeListViewArray::offsets() const { + return BoxOffsets(int64(), *data_); +} + +std::shared_ptr LargeListViewArray::sizes() const { + return 
BoxSizes(int64(), *data_); +} + // ---------------------------------------------------------------------- // MapArray diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 8d5cc95fec00d..61606e1592d61 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and -// Union +// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList, +// Map, Struct, and Union #pragma once @@ -43,30 +43,31 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// ListArray +// VarLengthListLikeArray template -class BaseListArray; +class VarLengthListLikeArray; namespace internal { -// Private helper for ListArray::SetData. -// Unfortunately, trying to define BaseListArray::SetData outside of this header +// Private helper for [Large]List[View]Array::SetData. +// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header // doesn't play well with MSVC. template -void SetListData(BaseListArray* self, const std::shared_ptr& data, +void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); } // namespace internal -/// Base class for variable-sized list arrays, regardless of offset size. +/// Base class for variable-sized list and list-view arrays, regardless of offset size. template -class BaseListArray : public Array { +class VarLengthListLikeArray : public Array { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; - const TypeClass* list_type() const { return list_type_; } + const TypeClass* var_length_list_like_type() const { return this->list_type_; } /// \brief Return array object containing the list's values /// @@ -84,19 +85,26 @@ class BaseListArray : public Array { } // The following functions will not perform boundschecking + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - offset_type value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists and list-views are possible, avoid calling this + /// function when the list at slot i is null. 
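+  /// For example, for the list array [[1, 2], null, [3]], value_length(0) is 2 and
+  /// value_length(2) is 1.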
+ /// + /// \pre IsValid(i) + virtual offset_type value_length(int64_t i) const = 0; + + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } protected: - friend void internal::SetListData(BaseListArray* self, + friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id); @@ -105,6 +113,29 @@ class BaseListArray : public Array { const offset_type* raw_value_offsets_ = NULLPTR; }; +// ---------------------------------------------------------------------- +// ListArray / LargeListArray + +template +class BaseListArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_type() const { return this->var_length_list_like_type(); } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + i += this->data_->offset; + return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; + } +}; + /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: @@ -120,10 +151,13 @@ class ARROW_EXPORT ListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. /// - /// Offsets of an Array's null bitmap can be present or an explicit - /// null_bitmap, but not both. + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type @@ -143,6 +177,10 @@ class ARROW_EXPORT ListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a ListArray from a ListViewArray + static Result> FromListView(const ListViewArray& source, + MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -181,7 +219,13 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. + /// + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. 
Must be of int64 type @@ -201,6 +245,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a LargeListArray from a LargeListViewArray + static Result> FromListView( + const LargeListViewArray& source, MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -216,6 +264,211 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { void SetData(const std::shared_ptr& data); }; +// ---------------------------------------------------------------------- +// ListViewArray / LargeListViewArray + +template +class BaseListViewArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } + + /// \brief Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } + + /// \brief Return pointer to raw value offsets accounting for any slice offset + const offset_type* raw_value_sizes() const { + return raw_value_sizes_ + this->data_->offset; + } + + /// \brief Return the size of the value at a particular index + /// + /// This should not be called if the list-view at slot i is null. + /// The returned size in those cases could be any value from 0 to the + /// length of the child values array. + /// + /// \pre IsValid(i) + offset_type value_length(int64_t i) const final { + return this->raw_value_sizes_[i + this->data_->offset]; + } + + protected: + const offset_type* raw_value_sizes_ = NULLPTR; +}; + +/// \brief Concrete Array class for list-view data +class ARROW_EXPORT ListViewArray : public BaseListViewArray { + public: + explicit ListViewArray(std::shared_ptr data); + + ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a ListViewArray using buffers from offsets and sizes arrays + /// that project views into the child values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the + /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array + /// can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). + /// + /// \param[in] offsets An array of int32 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created. 
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a ListViewArray from a ListArray + static Result> FromList(const ListArray& list_array, + MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the list-views in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// + /// This function invokes Concatenate() if list-views are non-contiguous. It + /// will try to minimize the number of array slices passed to Concatenate() by + /// maximizing the size of each slice (containing as many contiguous + /// list-views as possible). + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. + std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + ListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// \brief Concrete Array class for large list-view data (with 64-bit offsets +/// and sizes) +class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { + public: + explicit LargeListViewArray(std::shared_ptr data); + + LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct an LargeListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). 
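+  /// For example, offsets [0, 0, 1] with sizes [2, 1, null] over values [v0, v1]
+  /// describe the large list-view array [[v0, v1], [v0], null]: list-views may overlap
+  /// and their offsets need not be in increasing order.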
+ /// + /// \param[in] offsets An array of int64 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created. + /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Build a LargeListViewArray from a LargeListArray + static Result> FromList( + const LargeListArray& list_array, MemoryPool* pool); + + /// \brief Return an Array that is a concatenation of the large list-views in this + /// array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListArray::FromArrays() and get back the same list array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. + std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + LargeListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + // ---------------------------------------------------------------------- // MapArray @@ -319,10 +572,18 @@ class ARROW_EXPORT FixedSizeListArray : public Array { i += data_->offset; return list_size_ * i; } + /// \brief Return the fixed-size of the values + /// + /// No matter the value of the index parameter, the result is the same. + /// So even when the value at slot i is null, this function will return a + /// non-zero size. 
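+  /// For a fixed_size_list(int32(), 3) array, for instance, value_length(i) is 3 for
+  /// every i.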
+ /// + /// \pre IsValid(i) int32_t value_length(int64_t i = 0) const { ARROW_UNUSED(i); return list_size_; } + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 46908439ef5f0..974eb54d2caca 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -80,6 +80,22 @@ class TestArray : public ::testing::Test { MemoryPool* pool_; }; +void CheckDictionaryNullCount(const std::shared_ptr& dict_type, + const std::string& input_dictionary_json, + const std::string& input_index_json, + const int64_t& expected_null_count, + const int64_t& expected_logical_null_count, + bool expected_may_have_nulls, + bool expected_may_have_logical_nulls) { + std::shared_ptr arr = + DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); + + ASSERT_EQ(arr->null_count(), expected_null_count); + ASSERT_EQ(arr->ComputeLogicalNullCount(), expected_logical_null_count); + ASSERT_EQ(arr->data()->MayHaveNulls(), expected_may_have_nulls); + ASSERT_EQ(arr->data()->MayHaveLogicalNulls(), expected_may_have_logical_nulls); +} + TEST_F(TestArray, TestNullCount) { // These are placeholders auto data = std::make_shared(nullptr, 0); @@ -127,6 +143,37 @@ TEST_F(TestArray, TestNullCount) { ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount()); ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls()); ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls()); + + // Dictionary type + std::shared_ptr type; + std::shared_ptr dict_type; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + type = boolean(); + dict_type = dictionary(index_type, type); + // no null value + CheckDictionaryNullCount(dict_type, "[]", "[]", 0, 0, false, false); + CheckDictionaryNullCount(dict_type, "[true, false]", "[0, 1, 0]", 0, 0, false, false); + + // only indices contain null value + CheckDictionaryNullCount(dict_type, "[true, false]", "[null, 0, 1]", 1, 1, true, + true); + CheckDictionaryNullCount(dict_type, "[true, false]", "[null, null]", 2, 2, true, + true); + + // only dictionary contains null value + CheckDictionaryNullCount(dict_type, "[null, true]", "[]", 0, 0, false, true); + CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0]", 0, 2, false, + true); + + // both indices and dictionary contain null value + CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0, null]", 1, 3, + true, true); + CheckDictionaryNullCount(dict_type, "[null, true, null, false]", "[null, 1, 0, 2, 3]", + 1, 3, true, true); + } } TEST_F(TestArray, TestSlicePreservesAllNullCount) { @@ -137,6 +184,16 @@ TEST_F(TestArray, TestSlicePreservesAllNullCount) { Int32Array arr(/*length=*/100, data, null_bitmap, /*null_count*/ 100); EXPECT_EQ(arr.Slice(1, 99)->data()->null_count, arr.Slice(1, 99)->length()); + + // Dictionary type + std::shared_ptr dict_type = dictionary(int64(), boolean()); + std::shared_ptr dict_arr = + DictArrayFromJSON(dict_type, /*indices=*/"[null, 0, 0, 0, 0, 0, 1, 2, 0, 0]", + /*dictionary=*/"[null, true, false]"); + ASSERT_EQ(dict_arr->null_count(), 1); + ASSERT_EQ(dict_arr->ComputeLogicalNullCount(), 8); + ASSERT_EQ(dict_arr->Slice(2, 8)->null_count(), 0); + ASSERT_EQ(dict_arr->Slice(2, 8)->ComputeLogicalNullCount(), 6); } TEST_F(TestArray, TestLength) { @@ -398,6 +455,8 @@ static std::vector> TestArrayUtilitiesAgainstTheseType 
large_list(list(large_utf8())), fixed_size_list(utf8(), 3), fixed_size_list(int64(), 4), + list_view(utf8()), + large_list_view(utf8()), dictionary(int32(), utf8()), struct_({field("a", utf8()), field("b", int32())}), sparse_union(union_fields1, union_type_codes), @@ -616,6 +675,8 @@ static ScalarVector GetScalars() { ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3, 4]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared( ScalarVector{ std::make_shared(2), @@ -752,9 +813,9 @@ TEST_F(TestArray, TestFillFromScalar) { ArraySpan span(*scalar); auto roundtripped_array = span.ToArray(); - AssertArraysEqual(*array, *roundtripped_array); - ASSERT_OK(roundtripped_array->ValidateFull()); + + AssertArraysEqual(*array, *roundtripped_array); ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); AssertScalarsEqual(*scalar, *roundtripped_scalar); } @@ -3526,6 +3587,8 @@ DataTypeVector SwappableTypes() { large_utf8(), list(int16()), large_list(int16()), + list_view(int16()), + large_list_view(int16()), dictionary(int16(), utf8())}; } diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index d3502a0ab645a..40e705aa3e440 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -150,7 +150,8 @@ struct AppendScalarImpl { } template - enable_if_list_like Visit(const T&) { + enable_if_t::value || is_list_like_type::value, Status> Visit( + const T&) { auto builder = checked_cast::BuilderType*>(builder_); int64_t num_children = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { @@ -162,8 +163,12 @@ struct AppendScalarImpl { for (int64_t i = 0; i < n_repeats_; i++) { for (auto it = scalars_begin_; it != scalars_end_; ++it) { if (it->is_valid) { - RETURN_NOT_OK(builder->Append()); const Array& list = *checked_cast(*it).value; + if constexpr (T::type_id == Type::MAP || T::type_id == Type::FIXED_SIZE_LIST) { + RETURN_NOT_OK(builder->Append()); + } else { + RETURN_NOT_OK(builder->Append(/*is_valid=*/true, list.length())); + } for (int64_t i = 0; i < list.length(); i++) { ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index fbba1fd056430..5bdc76d96c8f0 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -30,6 +30,20 @@ namespace arrow { +// ---------------------------------------------------------------------- +// VarLengthListLikeBuilder / BaseListBuilder / BaseListViewBuilder + +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; + +template class BaseListBuilder; +template class BaseListBuilder; + +template class BaseListViewBuilder; +template class BaseListViewBuilder; + // ---------------------------------------------------------------------- // MapBuilder diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d0b17c230489b..21c2d4b270eb1 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -40,37 +40,46 @@ namespace arrow { /// @{ // 
---------------------------------------------------------------------- -// List builder +// VarLengthListLikeBuilder template -class BaseListBuilder : public ArrayBuilder { +class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type, - int64_t alignment = kDefaultBufferAlignment) + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), offsets_builder_(pool, alignment), value_builder_(value_builder), value_field_(type->field(0)->WithType(NULLPTR)) {} - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - int64_t alignment = kDefaultBufferAlignment) - : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {} + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : VarLengthListLikeBuilder(pool, value_builder, + std::make_shared(value_builder->type()), + alignment) {} + + ~VarLengthListLikeBuilder() override = default; Status Resize(int64_t capacity) override { if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { - return Status::CapacityError("List array cannot reserve space for more than ", + return Status::CapacityError(type_name(), + " array cannot reserve space for more than ", maximum_elements(), " got ", capacity); } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); - // One more than requested for offsets - ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + // One more than requested for list offsets + const int64_t offsets_capacity = + is_list_view(TYPE::type_id) ? capacity : capacity + 1; + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity)); return ArrayBuilder::Resize(capacity); } @@ -80,56 +89,98 @@ class BaseListBuilder : public ArrayBuilder { value_builder_->Reset(); } - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const offset_type* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); - } - /// \brief Start a new variable-length list slot /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true) { + /// This function should be called before appending elements to the + /// value builder. Elements appended to the value builder before this function + /// is called for the first time, will not be members of any list value. + /// + /// After this function is called, list_length elements SHOULD be appended to + /// the values builder. If this contract is violated, the behavior is defined by + /// the concrete builder implementation and SHOULD NOT be relied upon unless + /// the caller is specifically building a [Large]List or [Large]ListView array. + /// + /// For [Large]List arrays, the list slot length will be the number of elements + /// appended to the values builder before the next call to Append* or Finish. 
For + /// [Large]ListView arrays, the list slot length will be exactly list_length, but if + /// Append* is called before at least list_length elements are appended to the values + /// builder, the current list slot will share elements with the next list + /// slots or an invalid [Large]ListView array will be generated because there + /// aren't enough elements in the values builder to fill the list slots. + /// + /// If you're building a [Large]List and don't need to be compatible + /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)` + /// is a simpler API. + /// + /// \pre if is_valid is false, list_length MUST be 0 + /// \param is_valid Whether the new list slot is valid + /// \param list_length The number of elements in the list + Status Append(bool is_valid, int64_t list_length) { ARROW_RETURN_NOT_OK(Reserve(1)); + assert(is_valid || list_length == 0); UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); return Status::OK(); } - Status AppendNull() final { return Append(false); } + Status AppendNull() final { + // Append() a null list slot with list_length=0. + // + // When building [Large]List arrays, elements being appended to the values builder + // before the next call to Append* or Finish will extend the list slot length, but + // that is totally fine because list arrays admit non-empty null list slots. + // + // In the case of [Large]ListViews that's not a problem either because the + // list slot length remains zero. + return Append(false, 0); + } Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, false); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } - Status AppendEmptyValue() final { return Append(true); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure list slot remains empty + Status AppendEmptyValue() final { return Append(true, 0); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the last list slot remains empty Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, true); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } + /// \brief Vector append + /// + /// For list-array builders, the sizes are inferred from the offsets. 
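// Annotation, not part of the patch: a hedged sketch of the Append(is_valid,
// list_length) contract documented above, using the ListViewBuilder declared
// further down in this header. Assumes a Status-returning context.
auto values = std::make_shared<Int32Builder>();
ListViewBuilder builder(default_memory_pool(), values);
ARROW_RETURN_NOT_OK(builder.Append(/*is_valid=*/true, /*list_length=*/3));
ARROW_RETURN_NOT_OK(values->AppendValues({1, 2, 3}));  // exactly list_length values
ARROW_RETURN_NOT_OK(builder.Append(/*is_valid=*/false, /*list_length=*/0));  // null slot
std::shared_ptr<Array> out;
ARROW_RETURN_NOT_OK(builder.Finish(&out));  // [[1, 2, 3], null]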
+ /// BaseListBuilder provides an implementation that doesn't take sizes, but + /// this virtual function allows dispatching calls to both list-array and + /// list-view-array builders (which need the sizes) + /// + /// \param offsets The offsets of the variable-length lists + /// \param sizes The sizes of the variable-length lists + /// \param length The number of offsets, sizes, and validity bits to append + /// \param valid_bytes If passed, valid_bytes is of equal length to values, + /// and any zero byte will be considered as a null for that slot + virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) = 0; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); + [[maybe_unused]] const offset_type* sizes = NULLPTR; + if constexpr (is_list_view(TYPE::type_id)) { + sizes = array.GetValues(2); + } const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); @@ -137,43 +188,28 @@ class BaseListBuilder : public ArrayBuilder { const bool is_valid = all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || array.IsValid(row); + int64_t size = 0; + if (is_valid) { + if constexpr (is_list_view(TYPE::type_id)) { + size = sizes[row]; + } else { + size = offsets[row + 1] - offsets[row]; + } + } UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); if (is_valid) { - int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], - offsets[row], slot_length)); + ARROW_RETURN_NOT_OK( + value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); } } return Status::OK(); } - Status FinishInternal(std::shared_ptr* out) override { - ARROW_RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets, null_bitmap; - ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); - } - - std::shared_ptr items; - ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - - *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)}, - null_count_); - Reset(); - return Status::OK(); - } - Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { - return Status::CapacityError("List array cannot contain more than ", + return Status::CapacityError(type_name(), " array cannot contain more than ", maximum_elements(), " elements, have ", new_elements); } else { return Status::OK(); @@ -191,20 +227,136 @@ class BaseListBuilder : public ArrayBuilder { return std::make_shared(value_field_->WithType(value_builder_->type())); } + private: + static constexpr const char* type_name() { + if constexpr (is_list_view(TYPE::type_id)) { + return "ListView"; + } else { + return "List"; + } + } + protected: + /// \brief Append dimensions for num_values empty list slots. + /// + /// ListViewBuilder overrides this to also append the sizes. 
+ virtual void UnsafeAppendEmptyDimensions(int64_t num_values) { + const int64_t offset = value_builder_->length(); + for (int64_t i = 0; i < num_values; ++i) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + } + + /// \brief Append dimensions for a single list slot. + /// + /// ListViewBuilder overrides this to also append the size. + virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + TypedBufferBuilder offsets_builder_; std::shared_ptr value_builder_; std::shared_ptr value_field_; +}; + +// ---------------------------------------------------------------------- +// ListBuilder / LargeListBuilder + +template +class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + using BASE::Append; + + ~BaseListBuilder() override = default; + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true) { + // The value_length parameter to BASE::Append(bool, int64_t) is ignored when + // building a list array, so we can pass 0 here. + return BASE::Append(is_valid, 0); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + // Offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to partially rule out the possibility that the + // caller is passing sizes that could work if building a list-view, but don't + // work on building a list that requires offsets to be non-decreasing. + // + // CAUTION: the last size element (`sizes[length - 1]`) is not + // validated and could be inconsistent with the offsets given in a + // subsequent call to AppendValues. 
+#ifndef NDEBUG + if (sizes) { + for (int64_t i = 0; i < length - 1; ++i) { + if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { + if (!valid_bytes || valid_bytes[i]) { + return Status::Invalid( + "BaseListBuilder: sizes are inconsistent with offsets provided"); + } + } + } + } +#endif + return AppendValues(offsets, length, valid_bytes); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); - const int64_t num_values = value_builder_->length(); - return offsets_builder_.Append(static_cast(num_values)); + ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); + const int64_t num_values = this->value_builder_->length(); + return this->offsets_builder_.Append(static_cast(num_values)); } - void UnsafeAppendNextOffset() { - const int64_t num_values = value_builder_->length(); - offsets_builder_.UnsafeAppend(static_cast(num_values)); + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + std::shared_ptr null_bitmap; + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); } }; @@ -247,6 +399,116 @@ class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; +// ---------------------------------------------------------------------- +// ListViewBuilder / LargeListViewBuilder + +template +class ARROW_EXPORT BaseListViewBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + ~BaseListViewBuilder() override = default; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); + return sizes_builder_.Resize(capacity); + } + + void Reset() override { + BASE::Reset(); + sizes_builder_.Reset(); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + this->sizes_builder_.UnsafeAppend(sizes, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Offset and sizes padding zeroed by BufferBuilder + std::shared_ptr null_bitmap; + std::shared_ptr offsets; + std::shared_ptr sizes; + 
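// Annotation, not part of the patch: a hedged sketch of the offsets+sizes
// vector-append overload discussed above. For [Large]ListBuilder the pairs must
// satisfy offsets[i + 1] == offsets[i] + sizes[i] (cross-checked in debug builds);
// for [Large]ListViewBuilder both arrays are appended verbatim. The builders and
// their shared int32 value builder (already holding 5 values) are assumptions,
// as is a Status-returning context.
const int32_t offsets[] = {0, 2, 5};
const int32_t sizes[] = {2, 3, 0};
ARROW_RETURN_NOT_OK(list_builder.AppendValues(offsets, sizes, /*length=*/3,
                                              /*valid_bytes=*/NULLPTR));
ARROW_RETURN_NOT_OK(list_view_builder.AppendValues(offsets, sizes, /*length=*/3,
                                                   /*valid_bytes=*/NULLPTR));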
ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets), std::move(sizes)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } + + protected: + void UnsafeAppendEmptyDimensions(int64_t num_values) override { + for (int64_t i = 0; i < num_values; ++i) { + this->offsets_builder_.UnsafeAppend(0); + } + for (int64_t i = 0; i < num_values; ++i) { + this->sizes_builder_.UnsafeAppend(0); + } + } + + void UnsafeAppendDimensions(int64_t offset, int64_t size) override { + this->offsets_builder_.UnsafeAppend(static_cast(offset)); + this->sizes_builder_.UnsafeAppend(static_cast(size)); + } + + private: + TypedBufferBuilder sizes_builder_; +}; + +class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +class ARROW_EXPORT LargeListViewBuilder final + : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + // ---------------------------------------------------------------------- // Map builder diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 37c7271b5b95c..ff9ed66d1149f 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -35,14 +35,17 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/int_util_overflow.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/slice_util_internal.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" @@ -98,10 +101,18 @@ Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, return Status::OK(); } +int64_t SumBufferSizesInBytes(const BufferVector& buffers) { + int64_t size = 0; + for (const auto& buffer : buffers) { + size += buffer->size(); + } + return size; +} + // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. 
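// Annotation, not part of the patch: a worked example of the offset adjustment
// implemented below. Concatenating list offset buffers [0, 2, 3] and [0, 0, 1]:
// the second chunk must start where the first chunk's values end, so
// first_offset = 3, the displacement is 3 - 0 = 3, and the second chunk's
// offsets are rewritten to [3, 3, 4]; the final element written is the total
// length of the concatenated values (4).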
template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, @@ -113,33 +124,30 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, values_ranges->resize(buffers.size()); // allocate output buffer - int64_t out_length = 0; - for (const auto& buffer : buffers) { - out_length += buffer->size() / sizeof(Offset); - } - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer((out_length + 1) * sizeof(Offset), pool)); - auto dst = reinterpret_cast((*out)->mutable_data()); + const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size_in_bytes, pool)); + auto* out_data = (*out)->mutable_data_as(); int64_t elements_length = 0; Offset values_length = 0; for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(buffers[i], values_length, &dst[elements_length], - &(*values_ranges)[i])); + RETURN_NOT_OK(PutOffsets(*buffers[i], values_length, + out_data + elements_length, &(*values_ranges)[i])); elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); } - // the final element in dst is the length of all values spanned by the offsets - dst[out_length] = values_length; + // the final element in out_data is the length of all values spanned by the offsets + out_data[out_size_in_bytes / sizeof(Offset)] = values_length; return Status::OK(); } template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range) { - if (src->size() == 0) { + if (src.size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) values_range->offset = 0; @@ -148,8 +156,8 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse } // Get the range of offsets to transfer from src - auto src_begin = reinterpret_cast(src->data()); - auto src_end = reinterpret_cast(src->data() + src->size()); + auto src_begin = src.data_as(); + auto src_end = reinterpret_cast(src.data() + src.size()); // Compute the range of values which is spanned by this range of offsets values_range->offset = src_begin[0]; @@ -160,16 +168,132 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse // Write offsets into dst, ensuring that the first offset written is // first_offset - auto adjustment = first_offset - src_begin[0]; + auto displacement = first_offset - src_begin[0]; // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. // Avoid UB on non-validated input by doing the addition in the unsigned domain. 
// (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [adjustment](Offset offset) { - return SafeSignedAdd(offset, adjustment); + std::transform(src_begin, src_end, dst, [displacement](Offset offset) { + return SafeSignedAdd(offset, displacement); }); return Status::OK(); } +template +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst); + +// Concatenate buffers holding list-view offsets into a single buffer of offsets +// +// value_ranges contains the relevant ranges of values in the child array actually +// referenced to by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced to by any view. +// +// The child arrays and the sizes buffer are used to ensure we can trust the offsets in +// offset_buffers to be within the valid range. +// +// This function also mutates sizes so that null list-view entries have size 0. +// +// \param[in] in The child arrays +// \param[in,out] sizes The concatenated sizes buffer +template +Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, + const BufferVector& offset_buffers, + const std::vector& value_ranges, + MemoryPool* pool, std::shared_ptr* out) { + DCHECK_EQ(offset_buffers.size(), value_ranges.size()); + + // Allocate resulting offsets buffer and initialize it with zeros + const int64_t out_size_in_bytes = SumBufferSizesInBytes(offset_buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); + memset((*out)->mutable_data(), 0, static_cast((*out)->size())); + + auto* out_offsets = (*out)->mutable_data_as(); + + int64_t num_child_values = 0; + int64_t elements_length = 0; + for (size_t i = 0; i < offset_buffers.size(); ++i) { + const auto displacement = + static_cast(num_child_values - value_ranges[i].offset); + RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + elements_length += offset_buffers[i]->size() / sizeof(offset_type); + num_child_values += value_ranges[i].length; + if (num_child_values > std::numeric_limits::max()) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + } + DCHECK_EQ(elements_length, + static_cast(out_size_in_bytes / sizeof(offset_type))); + + return Status::OK(); +} + +template +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst) { + if (src.size() == 0) { + return Status::OK(); + } + const auto& validity_buffer = input.buffers[0]; + if (validity_buffer) { + // Ensure that it is safe to access all the bits in the validity bitmap of input. + RETURN_NOT_OK(internal::CheckSliceParams(/*size=*/8 * validity_buffer->size(), + input.offset, input.length, "buffer")); + } + + const auto offsets = src.data_as(); + DCHECK_EQ(static_cast(src.size() / sizeof(offset_type)), input.length); + + auto visit_not_null = [&](int64_t position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. 
(the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave the dst[position] as 0. + } + }; + + const auto* validity = validity_buffer ? validity_buffer->data_as() : nullptr; + internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); + int64_t position = 0; + while (position < input.length) { + internal::BitBlockCount block = bit_counter.NextBlock(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + visit_not_null(position); + } + } else if (block.NoneSet()) { + // NOTE: we don't have to do anything for the null entries regarding the + // offsets as the buffer is initialized to 0 when it is allocated. + + // Zero-out the sizes of the null entries to ensure these sizes are not + // greater than the new values length of the concatenated array. + memset(sizes + position, 0, block.length * sizeof(offset_type)); + position += block.length; + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(validity, input.offset + position)) { + visit_not_null(position); + } else { + // Zero-out the size at position. + sizes[position] = 0; + } + } + } + } + return Status::OK(); +} + class ConcatenateImpl { public: ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool) @@ -288,6 +412,41 @@ class ConcatenateImpl { return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); } + template + enable_if_list_view Visit(const T& type) { + using offset_type = typename T::offset_type; + out_->buffers.resize(3); + out_->child_data.resize(1); + + // Calculate the ranges of values that each list-view array uses + std::vector value_ranges; + value_ranges.reserve(in_.size()); + for (const auto& input : in_) { + ArraySpan input_span(*input); + Range range; + ARROW_ASSIGN_OR_RAISE(std::tie(range.offset, range.length), + list_util::internal::RangeOfValuesUsed(input_span)); + value_ranges.push_back(range); + } + + // Concatenate the values + ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, value_ranges)); + RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); + out_->child_data[0]->type = type.value_type(); + + // Concatenate the sizes first + ARROW_ASSIGN_OR_RAISE(auto size_buffers, Buffers(2, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateBuffers(size_buffers, pool_).Value(&out_->buffers[2])); + + // Concatenate the offsets + ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateListViewOffsets( + in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), offset_buffers, + value_ranges, pool_, &out_->buffers[1])); + + return Status::OK(); + } + Status Visit(const FixedSizeListType& fixed_size_list) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size())); return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0ef1136ea78f8..af595e897f9ee 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -40,26 +41,55 @@ #include 
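// Annotation, not part of the patch: a hedged sketch of concatenating list-view
// arrays through the Visit(list-view) implementation added above. Assumes a
// Status-returning context; ListViewArray::FromArrays is used the same way as in
// the tests that follow.
ARROW_ASSIGN_OR_RAISE(
    auto a, ListViewArray::FromArrays(*ArrayFromJSON(int32(), "[0, 1]"),   // offsets
                                      *ArrayFromJSON(int32(), "[1, 2]"),   // sizes
                                      *ArrayFromJSON(int8(), "[1, 2, 3]")));
ARROW_ASSIGN_OR_RAISE(
    auto b, ListViewArray::FromArrays(*ArrayFromJSON(int32(), "[0]"),
                                      *ArrayFromJSON(int32(), "[2]"),
                                      *ArrayFromJSON(int8(), "[4, 5]")));
ARROW_ASSIGN_OR_RAISE(auto concatenated, Concatenate({a, b}));
// concatenated is [[1], [2, 3], [4, 5]]; the second chunk's offsets are displaced
// so its views point into the values appended after the first chunk's values.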
"arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/list_util.h" namespace arrow { -class ConcatenateTest : public ::testing::Test { - protected: - ConcatenateTest() - : rng_(seed_), - sizes_({0, 1, 2, 4, 16, 31, 1234}), - null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} +class SimpleRandomArrayGenerator { + private: + random::SeedType seed_ = 0xdeadbeef; + std::default_random_engine random_engine_; + random::RandomArrayGenerator rag_; + + public: + SimpleRandomArrayGenerator() : random_engine_(seed_), rag_(seed_) {} + + template + std::vector RandomOffsetsInRange(offset_type min_offset, + offset_type max_offset, + int64_t num_offsets) { + std::vector offsets(static_cast(num_offsets)); + std::uniform_int_distribution dist(min_offset, max_offset); + std::generate(offsets.begin(), offsets.end(), [&] { return dist(random_engine_); }); + return offsets; + } - template - std::vector Offsets(int32_t length, int32_t slice_count) { - std::vector offsets(static_cast(slice_count + 1)); - std::default_random_engine gen(seed_); - std::uniform_int_distribution dist(0, length); - std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); + template + std::vector Offsets(int32_t values_length, int32_t slice_count) { + auto offsets = RandomOffsetsInRange(0, values_length, slice_count + 1); std::sort(offsets.begin(), offsets.end()); return offsets; } + /// \param[in] random_offsets Random offsets in [0, values_size] and no particular order + template + std::vector ListViewSizes(const std::vector& random_offsets, + int64_t values_size, double avg_size, + int64_t num_sizes) { + std::normal_distribution normal(/*mean=*/avg_size, /*stddev=*/3.0); + std::vector sizes; + sizes.reserve(num_sizes); + for (int64_t i = 0; i < num_sizes; ++i) { + const auto sampled_size = std::llround(normal(random_engine_)); + auto size = std::max(0, static_cast(sampled_size)); + if (random_offsets[i] > values_size - size) { + size = static_cast(values_size - random_offsets[i]); + } + sizes.push_back(size); + } + return sizes; + } + ArrayVector Slices(const std::shared_ptr& array, const std::vector& offsets) { ArrayVector slices(offsets.size() - 1); @@ -69,33 +99,119 @@ class ConcatenateTest : public ::testing::Test { return slices; } + std::shared_ptr ValidityBitmap(int64_t size, double null_probability) { + return rag_.NullBitmap(size, null_probability, kDefaultBufferAlignment, + default_memory_pool()); + } + template - std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { + std::shared_ptr PrimitiveArray(int64_t size, double null_probability) { if (std::is_same::value) { - return rng_.Boolean(size, 0.5, null_probability); + return rag_.Boolean(size, 0.5, null_probability); } - return rng_.Numeric(size, 0, 127, null_probability); + return rag_.Numeric(size, 0, 127, null_probability); + } + + std::shared_ptr StringArray(int64_t size, double null_probability) { + return rag_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + } + + std::shared_ptr LargeStringArray(int64_t size, double null_probability) { + return rag_.LargeString(size, /*min_length =*/0, /*max_length =*/15, + null_probability); + } + + std::shared_ptr StringViewArray(int64_t size, double null_probability) { + return rag_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + } + + std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, + double null_probability) { + return rag_.ArrayOf(std::move(type), 
size, null_probability); + } + + // TODO(GH-38656): Use the random array generators from testing/random.h here + + template ::ArrayType> + Result> ListArray(int32_t length, + double null_probability) { + using offset_type = typename ListType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + auto values_size = length * 4; + auto values = PrimitiveArray(values_size, null_probability); + auto offsets_vector = Offsets(values_size, length); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + return ListArrayType::FromArrays(*offsets, *values); } + template ::ArrayType> + Result> ListViewArray(int32_t length, + double null_probability) { + using offset_type = typename ListViewType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + constexpr int kAvgListViewSize = 4; + auto values_size = kAvgListViewSize * length; + + auto values = PrimitiveArray(values_size, null_probability); + + std::shared_ptr offsets; + auto offsets_vector = RandomOffsetsInRange(0, values_size, length); + ArrayFromVector(offsets_vector, &offsets); + + std::shared_ptr sizes; + auto sizes_vector = + ListViewSizes(offsets_vector, values_size, kAvgListViewSize, length); + ArrayFromVector(sizes_vector, &sizes); + + auto validity_bitmap = ValidityBitmap(length, null_probability); + auto valid_count = internal::CountSetBits(validity_bitmap->data(), 0, length); + + return ListViewArrayType::FromArrays( + *offsets, *sizes, *values, default_memory_pool(), + valid_count == length ? nullptr : std::move(validity_bitmap)); + } +}; + +class ConcatenateTest : public ::testing::Test { + private: + std::vector sizes_; + std::vector null_probabilities_; + + protected: + SimpleRandomArrayGenerator rag; + + ConcatenateTest() + : sizes_({0, 1, 2, 4, 16, 31, 1234}), + null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { if (auto preceding_bits = bit_util::kPrecedingBitmask[length % 8]) { auto last_byte = bitmap->data()[length / 8]; ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) - << length << " " << int(preceding_bits); + << length << " " << static_cast(preceding_bits); } } template void Check(ArrayFactory&& factory) { for (auto size : this->sizes_) { - auto offsets = this->Offsets(size, 3); + auto offsets = rag.Offsets(size, 3); for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); ASSERT_OK(expected->ValidateFull()); - auto slices = this->Slices(array, offsets); + auto slices = rag.Slices(array, offsets); for (auto slice : slices) { ASSERT_OK(slice->ValidateFull()); } @@ -111,11 +227,6 @@ class ConcatenateTest : public ::testing::Test { } } } - - random::SeedType seed_ = 0xdeadbeef; - random::RandomArrayGenerator rng_; - std::vector sizes_; - std::vector null_probabilities_; }; TEST(ConcatenateEmptyArraysTest, TestValueBuffersNullPtr) { @@ -144,7 +255,7 @@ TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveArrowTypes); TYPED_TEST(PrimitiveConcatenateTest, Primitives) { this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->template GeneratePrimitive(size, null_probability); + *out = this->rag.template 
PrimitiveArray(size, null_probability); }); } @@ -156,23 +267,21 @@ TEST_F(ConcatenateTest, NullType) { TEST_F(ConcatenateTest, StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.StringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, - /*max_buffer_length=*/200); + *out = rag.StringViewArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = - rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.LargeStringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } @@ -181,7 +290,7 @@ TEST_F(ConcatenateTest, FixedSizeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { auto list_size = 3; auto values_size = size * list_size; - auto values = this->GeneratePrimitive(values_size, null_probability); + auto values = this->rag.PrimitiveArray(values_size, null_probability); ASSERT_OK_AND_ASSIGN(*out, FixedSizeListArray::FromArrays(values, list_size)); ASSERT_OK((**out).ValidateFull()); }); @@ -189,39 +298,40 @@ TEST_F(ConcatenateTest, FixedSizeListType) { TEST_F(ConcatenateTest, ListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, ListArray::FromArrays(*offsets, *values)); + ASSERT_OK_AND_ASSIGN(*out, this->rag.ListArray(size, null_probability)); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, LargeListArray::FromArrays(*offsets, *values)); + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, ListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListViewArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, LargeListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN( + *out, this->rag.ListViewArray(size, null_probability)); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StructType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { 
- auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); + auto foo = this->rag.PrimitiveArray(size, null_probability); + auto bar = this->rag.PrimitiveArray(size, null_probability); + auto baz = this->rag.PrimitiveArray(size, null_probability); *out = std::make_shared( struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), size, ArrayVector{foo, bar, baz}); @@ -230,8 +340,8 @@ TEST_F(ConcatenateTest, StructType) { TEST_F(ConcatenateTest, DictionaryType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto indices = this->GeneratePrimitive(size, null_probability); - auto dict = this->GeneratePrimitive(128, 0); + auto indices = rag.PrimitiveArray(size, null_probability); + auto dict = rag.PrimitiveArray(128, 0); auto type = dictionary(int32(), dict->type()); *out = std::make_shared(type, indices, dict); }); @@ -382,20 +492,20 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(sparse_union({ - field("a", float64()), - field("b", boolean()), - }), - size, null_probability); + *out = rag.ArrayOf(sparse_union({ + field("a", float64()), + field("b", boolean()), + }), + size, null_probability); }); // dense mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(dense_union({ - field("a", uint32()), - field("b", boolean()), - field("c", int8()), - }), - size, null_probability); + *out = rag.ArrayOf(dense_union({ + field("a", uint32()), + field("b", boolean()), + field("c", int8()), + }), + size, null_probability); }); } @@ -413,7 +523,7 @@ TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { auto type_ids_ok = ArrayFromJSON(int8(), "[0]"); auto offsets_ok = ArrayFromJSON(int32(), "[0]"); auto child_array_overflow = - this->rng_.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); + rag.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); ASSERT_OK_AND_ASSIGN( auto array_overflow, DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow})); @@ -546,7 +656,7 @@ TEST_F(ConcatenateTest, DenseUnionType) { TEST_F(ConcatenateTest, ExtensionType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto storage = this->GeneratePrimitive(size, null_probability); + auto storage = this->rag.PrimitiveArray(size, null_probability); *out = ExtensionType::WrapArray(smallint(), storage); }); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 186682be3009e..c002c0817b194 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -33,6 +33,7 @@ #include "arrow/type_traits.h" #include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/dict_util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ree_util.h" @@ -93,6 +94,10 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } +bool DictionaryMayHaveLogicalNulls(const ArrayData& data) { + return ArraySpan(data).MayHaveLogicalNulls(); +} + BufferSpan PackVariadicBuffers(util::span> buffers) { return {const_cast(reinterpret_cast(buffers.data())), static_cast(buffers.size() * sizeof(std::shared_ptr))}; @@ -174,7 +179,7 @@ int64_t ArrayData::GetNullCount() const { } 
int64_t ArrayData::ComputeLogicalNullCount() const { - if (this->buffers[0]) { + if (this->buffers[0] && this->type->id() != Type::DICTIONARY) { return GetNullCount(); } return ArraySpan(*this).ComputeLogicalNullCount(); @@ -244,9 +249,22 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { auto* offsets = reinterpret_cast(scratch_space); offsets[0] = 0; offsets[1] = static_cast(value_size); + static_assert(2 * sizeof(offset_type) <= 16); return {scratch_space, sizeof(offset_type) * 2}; } +template +std::pair OffsetsAndSizesForScalar(uint8_t* scratch_space, + offset_type value_size) { + auto* offsets = scratch_space; + auto* sizes = scratch_space + sizeof(offset_type); + reinterpret_cast(offsets)[0] = 0; + reinterpret_cast(sizes)[0] = value_size; + static_assert(2 * sizeof(offset_type) <= 16); + return {BufferSpan{offsets, sizeof(offset_type)}, + BufferSpan{sizes, sizeof(offset_type)}}; +} + int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: @@ -261,6 +279,8 @@ int GetNumBuffers(const DataType& type) { case Type::STRING_VIEW: case Type::BINARY_VIEW: case Type::DENSE_UNION: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: return 3; case Type::EXTENSION: // The number of buffers depends on the storage type @@ -381,7 +401,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); this->buffers[1].size = scalar.value->size(); - } else if (is_list_like(type_id)) { + } else if (is_var_length_list_like(type_id) || type_id == Type::FIXED_SIZE_LIST) { const auto& scalar = checked_cast(value); int64_t value_length = 0; @@ -402,7 +422,14 @@ void ArraySpan::FillFromScalar(const Scalar& value) { OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); } else if (type_id == Type::LARGE_LIST) { this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); + } else if (type_id == Type::LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( + scalar.scratch_space_, static_cast(value_length)); + } else if (type_id == Type::LARGE_LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = + OffsetsAndSizesForScalar(scalar.scratch_space_, value_length); } else { + DCHECK_EQ(type_id, Type::FIXED_SIZE_LIST); // FIXED_SIZE_LIST: does not have a second buffer this->buffers[1] = {}; } @@ -520,6 +547,9 @@ int64_t ArraySpan::ComputeLogicalNullCount() const { if (t == Type::RUN_END_ENCODED) { return ree_util::LogicalNullCount(*this); } + if (t == Type::DICTIONARY) { + return dict_util::LogicalNullCount(*this); + } return GetNullCount(); } @@ -617,6 +647,10 @@ bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const { return ree_util::ValuesArray(*this).MayHaveLogicalNulls(); } +bool ArraySpan::DictionaryMayHaveLogicalNulls() const { + return this->GetNullCount() != 0 || this->dictionary().GetNullCount() != 0; +} + // ---------------------------------------------------------------------- // Implement internal::GetArrayView diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 40a77640cd1e5..4c2df8381490a 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -38,7 +38,7 @@ struct ArrayData; namespace internal { // ---------------------------------------------------------------------- -// Null handling for types without a validity bitmap +// Null handling for types without a validity bitmap and the dictionary type ARROW_EXPORT bool IsNullSparseUnion(const 
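// Annotation, not part of the patch: a hedged sketch of the scalar-to-span path
// served by the list-view branches above. A ListViewScalar becomes a length-1
// span whose offset ({0}) and size ({3}) live in the scalar's scratch space.
ListViewScalar scalar(ArrayFromJSON(int8(), "[1, 2, 3]"));
ArraySpan span(scalar);
auto as_array = span.ToArray();  // a length-1 list-view array: [[1, 2, 3]]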
ArrayData& data, int64_t i); ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i); @@ -46,6 +46,7 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); } // namespace internal // When slicing, we do not know the null count of the sliced range without @@ -280,7 +281,7 @@ struct ARROW_EXPORT ArrayData { /// \brief Return true if the validity bitmap may have 0's in it, or if the /// child arrays (in the case of types without a validity bitmap) may have - /// nulls + /// nulls, or if the dictionary of dictionay array may have nulls. /// /// This is not a drop-in replacement for MayHaveNulls, as historically /// MayHaveNulls() has been used to check for the presence of a validity @@ -325,6 +326,9 @@ struct ARROW_EXPORT ArrayData { if (t == Type::RUN_END_ENCODED) { return internal::RunEndEncodedMayHaveLogicalNulls(*this); } + if (t == Type::DICTIONARY) { + return internal::DictionaryMayHaveLogicalNulls(*this); + } return null_count.load() != 0; } @@ -505,7 +509,7 @@ struct ARROW_EXPORT ArraySpan { /// \brief Return true if the validity bitmap may have 0's in it, or if the /// child arrays (in the case of types without a validity bitmap) may have - /// nulls + /// nulls, or if the dictionary of dictionay array may have nulls. /// /// \see ArrayData::MayHaveLogicalNulls bool MayHaveLogicalNulls() const { @@ -519,6 +523,9 @@ struct ARROW_EXPORT ArraySpan { if (t == Type::RUN_END_ENCODED) { return RunEndEncodedMayHaveLogicalNulls(); } + if (t == Type::DICTIONARY) { + return DictionaryMayHaveLogicalNulls(); + } return null_count != 0; } @@ -560,6 +567,7 @@ struct ARROW_EXPORT ArraySpan { bool UnionMayHaveLogicalNulls() const; bool RunEndEncodedMayHaveLogicalNulls() const; + bool DictionaryMayHaveLogicalNulls() const; }; namespace internal { diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index be9597e59b378..f9714eda34c61 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -289,6 +289,13 @@ class ValueComparatorFactory { Status Visit(const NullType&, const Array&, const Array&) { return Status::NotImplemented("null type"); } + Status Visit(const ListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } + + Status Visit(const LargeListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } Status Visit(const ExtensionType&, const Array&, const Array&) { return Status::NotImplemented("extension type"); @@ -589,6 +596,9 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else if (base.type()->id() == Type::LIST_VIEW || + base.type()->id() == Type::LARGE_LIST_VIEW) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } @@ -732,6 +742,14 @@ class MakeFormatterImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + + Status Visit(const LargeListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + // TODO(bkietz) 
format maps better Status Visit(const StructType& t) { diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 9ea2fc2b6f0a1..86e2ffcae4de7 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -134,7 +134,6 @@ class ArrayDataEndianSwapper { out_->buffers[index] = data_->buffers[index]; return Status::OK(); } - // Except union, offset has one more element rather than data->length ARROW_ASSIGN_OR_RAISE(out_->buffers[index], ByteSwapBuffer(data_->buffers[index])); return Status::OK(); @@ -290,6 +289,17 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const ListViewType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); + } + Status Visit(const LargeListViewType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); + } + Status Visit(const DictionaryType& type) { // dictionary was already swapped in ReadDictionary() in ipc/reader.cc RETURN_NOT_OK(SwapType(*type.index_type())); @@ -379,7 +389,14 @@ class NullArrayFactory { enable_if_var_size_list Visit(const T& type) { // values array may be empty, but there must be at least one offset of 0 RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1))); - RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& type) { + RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_)); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } @@ -518,8 +535,8 @@ class NullArrayFactory { } template - enable_if_var_size_list Visit(const T& type) { - out_->buffers.resize(2, buffer_); + enable_if_var_length_list_like Visit(const T& type) { + out_->buffers.resize(is_list_view(T::type_id) ? 
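// Annotation, not part of the patch: a hedged sketch of what the NullArrayFactory
// changes above produce: an all-null list-view array whose validity, offsets and
// sizes reuse a single zero-initialized buffer. Assumes a Status-returning context.
ARROW_ASSIGN_OR_RAISE(auto nulls, MakeArrayOfNull(list_view(utf8()), /*length=*/5));
// nulls->null_count() == 5; every offset and size is 0 and the child array is empty.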
3 : 2, buffer_); ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, /*length=*/0)); return Status::OK(); } @@ -698,12 +715,28 @@ class RepeatedArrayFactory { std::shared_ptr offsets_buffer; auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared(scalar_.type, length_, offsets_buffer, value_array); return Status::OK(); } + template + enable_if_list_view Visit(const T& type) { + using ScalarType = typename TypeTraits::ScalarType; + using ArrayType = typename TypeTraits::ArrayType; + + auto value = checked_cast(scalar_).value; + + auto size = static_cast(value->length()); + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, + CreateIntBuffer(0)); + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + CreateIntBuffer(size)); + out_ = std::make_shared(scalar_.type, length_, std::move(offsets_buffer), + std::move(sizes_buffer), value); + return Status::OK(); + } + Status Visit(const FixedSizeListType& type) { auto value = checked_cast(scalar_).value; @@ -853,6 +886,15 @@ class RepeatedArrayFactory { return builder.Finish(out); } + template + Result> CreateIntBuffer(IntType value) { + std::shared_ptr buffer; + TypedBufferBuilder builder(pool_); + RETURN_NOT_OK(builder.Append(/*num_copies=*/length_, value)); + RETURN_NOT_OK(builder.Finish(&buffer)); + return buffer; + } + Status CreateBufferOf(const void* data, size_t data_length, std::shared_ptr* out) { BufferBuilder builder(pool_); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3dde41b1450e8..8dd3eb3f90c15 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -23,7 +23,6 @@ #include "arrow/extension_type.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" @@ -269,6 +268,9 @@ struct ValidateArrayImpl { return MapArray::ValidateChildData(data.child_data); } + Status Visit(const ListViewType& type) { return ValidateListView(type); } + Status Visit(const LargeListViewType& type) { return ValidateListView(type); } + Status Visit(const FixedSizeListType& type) { const ArrayData& values = *data.child_data[0]; const int64_t list_size = type.list_size(); @@ -582,7 +584,7 @@ struct ValidateArrayImpl { const Buffer& values = *data.buffers[2]; // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.size())); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.size())); if (data.length > 0 && data.buffers[1]->is_cpu()) { using offset_type = typename BinaryType::offset_type; @@ -702,7 +704,7 @@ struct ValidateArrayImpl { } // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.offset + values.length)); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.offset + values.length)); // An empty list array can have 0 offsets if (data.length > 0 && data.buffers[1]->is_cpu()) { @@ -735,6 +737,18 @@ struct ValidateArrayImpl { return Status::OK(); } + template + Status ValidateListView(const ListViewType& type) { + const ArrayData& values = *data.child_data[0]; + const Status child_valid = RecurseInto(values); + if (!child_valid.ok()) { + return Status::Invalid("List-view child array is invalid: ", + child_valid.ToString()); + } + // For list-views, sizes are validated together with offsets. 
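// Annotation, not part of the patch: a hedged sketch of the repeated-array path
// added above (RepeatedArrayFactory): broadcasting a list-view scalar into a
// constant array. Assumes a Status-returning context.
ListViewScalar scalar(ArrayFromJSON(int8(), "[7, 8]"));
ARROW_ASSIGN_OR_RAISE(auto repeated, MakeArrayFromScalar(scalar, /*length=*/3));
// repeated is [[7, 8], [7, 8], [7, 8]]: every slot has offset 0 and size 2,
// all sharing the scalar's two child values.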
+ return ValidateOffsetsAndSizes(type, /*offset_limit=*/values.length); + } + template Status ValidateRunEndEncoded(const RunEndEncodedType& type) { if (data.child_data.size() != 2) { @@ -797,23 +811,105 @@ struct ValidateArrayImpl { return Status::OK(); } + private: + /// \pre basic validation has already been performed + template + Status FullyValidateOffsets(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + auto prev_offset = offsets[0]; + if (prev_offset < 0) { + return Status::Invalid("Offset invariant failure: array starts at negative offset ", + prev_offset); + } + for (int64_t i = 1; i <= data.length; ++i) { + const auto current_offset = offsets[i]; + if (current_offset < prev_offset) { + return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ", + i, ": ", current_offset, " < ", prev_offset); + } + if (current_offset > offset_limit) { + return Status::Invalid("Offset invariant failure: offset for slot ", i, + " out of bounds: ", current_offset, " > ", offset_limit); + } + prev_offset = current_offset; + } + return Status::OK(); + } + + template + Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: offset for slot ", slot, + " out of bounds. Expected ", offset, + " to be at least 0 and less than ", offset_limit); + } + + template + Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + const auto size = sizes[slot]; + if (size < 0) { + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", size, " < 0"); + } else { + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", offset, " + ", size, " > ", + offset_limit); + } + } + + /// \pre basic validation has already been performed + template + Status FullyValidateOffsetsAndSizes(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + + for (int64_t i = 0; i < data.length; ++i) { + const auto size = sizes[i]; + if (size >= 0) { + const auto offset = offsets[i]; + if (offset < 0 || offset > offset_limit) { + return OutOfBoundsListViewOffset(i, offset_limit); + } + if (size > offset_limit - offset) { + return OutOfBoundsListViewSize(i, offset_limit); + } + } else { + return OutOfBoundsListViewSize(i, offset_limit); + } + } + + return Status::OK(); + } + + public: template - Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) { + Status ValidateOffsetsAndSizes(const TypeClass&, int64_t offset_limit) { using offset_type = typename TypeClass::offset_type; + constexpr bool is_list_view = is_list_view_type::value; - if (!IsBufferValid(1)) { - // For length 0, an empty offsets buffer seems accepted as a special case - // (ARROW-544) - if (data.length > 0) { - return Status::Invalid("Non-empty array but offsets are null"); + const bool non_empty = data.length > 0; + if constexpr (is_list_view) { + if (!IsBufferValid(1)) { + return Status::Invalid("offsets buffer is null"); + } + if (!IsBufferValid(2)) { + return Status::Invalid("sizes buffer is null"); + } + } else { + if (!IsBufferValid(1)) { + // For length 0, an empty offsets buffer is accepted (ARROW-544). + return non_empty ? 
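// Annotation, not part of the patch: a hedged sketch of the out-of-bounds checks
// added above. A list-view whose size reaches past the child values is expected
// to fail full validation; FromArrays is assumed to perform only basic checks here.
ARROW_ASSIGN_OR_RAISE(
    auto bad, ListViewArray::FromArrays(*ArrayFromJSON(int32(), "[1]"),  // offsets
                                        *ArrayFromJSON(int32(), "[4]"),  // sizes
                                        *ArrayFromJSON(int8(), "[1, 2, 3]")));
ASSERT_RAISES(Invalid, bad->ValidateFull());  // size 4 at offset 1 exceeds 3 values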
Status::Invalid("Non-empty array but offsets are null") + : Status::OK(); } - return Status::OK(); } - // An empty list array can have 0 offsets const auto offsets_byte_size = data.buffers[1]->size(); const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0)) - ? data.length + data.offset + 1 + ? data.length + data.offset + (is_list_view ? 0 : 1) : 0; if (offsets_byte_size / static_cast(sizeof(offset_type)) < required_offsets) { @@ -821,28 +917,21 @@ struct ValidateArrayImpl { " isn't large enough for length: ", data.length, " and offset: ", data.offset); } + if constexpr (is_list_view) { + const auto required_sizes = data.length + data.offset; + const auto sizes_bytes_size = data.buffers[2]->size(); + if (sizes_bytes_size / static_cast(sizeof(offset_type)) < required_sizes) { + return Status::Invalid("Sizes buffer size (bytes): ", sizes_bytes_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + } if (full_validation && required_offsets > 0) { - // Validate all offset values - const offset_type* offsets = data.GetValues(1); - - auto prev_offset = offsets[0]; - if (prev_offset < 0) { - return Status::Invalid( - "Offset invariant failure: array starts at negative offset ", prev_offset); - } - for (int64_t i = 1; i <= data.length; ++i) { - const auto current_offset = offsets[i]; - if (current_offset < prev_offset) { - return Status::Invalid( - "Offset invariant failure: non-monotonic offset at slot ", i, ": ", - current_offset, " < ", prev_offset); - } - if (current_offset > offset_limit) { - return Status::Invalid("Offset invariant failure: offset for slot ", i, - " out of bounds: ", current_offset, " > ", offset_limit); - } - prev_offset = current_offset; + if constexpr (is_list_view) { + return FullyValidateOffsetsAndSizes(offset_limit); + } else { + return FullyValidateOffsets(offset_limit); } } return Status::OK(); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index c7e6207bfefa4..7042d9818c691 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -221,6 +221,20 @@ struct MakeBuilderImpl { return Status::OK(); } + Status Visit(const ListViewType& list_view_type) { + std::shared_ptr value_type = list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new ListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + + Status Visit(const LargeListViewType& large_list_view_type) { + std::shared_ptr value_type = large_list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new LargeListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + Status Visit(const MapType& map_type) { ARROW_ASSIGN_OR_RAISE(auto key_builder, ChildBuilder(map_type.key_type())); ARROW_ASSIGN_OR_RAISE(auto item_builder, ChildBuilder(map_type.item_type())); diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 033371d3d6719..238afb0328672 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" #include "arrow/util/small_vector.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" @@ -260,7 +262,7 @@ struct SchemaExporter { // Dictionary type: parent struct 
describes index type, // child dictionary struct describes value type. RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this)); - dict_exporter_.reset(new SchemaExporter()); + dict_exporter_ = std::make_unique(); RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type())); } else { RETURN_NOT_OK(VisitTypeInline(type, this)); @@ -357,10 +359,14 @@ struct SchemaExporter { Status Visit(const LargeBinaryType& type) { return SetFormat("Z"); } + Status Visit(const BinaryViewType& type) { return SetFormat("vz"); } + Status Visit(const StringType& type) { return SetFormat("u"); } Status Visit(const LargeStringType& type) { return SetFormat("U"); } + Status Visit(const StringViewType& type) { return SetFormat("vu"); } + Status Visit(const Date32Type& type) { return SetFormat("tdD"); } Status Visit(const Date64Type& type) { return SetFormat("tdm"); } @@ -444,6 +450,10 @@ struct SchemaExporter { Status Visit(const LargeListType& type) { return SetFormat("+L"); } + Status Visit(const ListViewType& type) { return SetFormat("+vl"); } + + Status Visit(const LargeListViewType& type) { return SetFormat("+vL"); } + Status Visit(const FixedSizeListType& type) { return SetFormat("+w:" + ToChars(type.list_size())); } @@ -517,13 +527,14 @@ namespace { struct ExportedArrayPrivateData : PoolAllocationMixin { // The buffers are owned by the ArrayData member - StaticVector buffers_; + SmallVector buffers_; struct ArrowArray dictionary_; SmallVector children_; SmallVector child_pointers_; std::shared_ptr data_; std::shared_ptr sync_; + std::vector variadic_buffer_sizes_; ExportedArrayPrivateData() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData); @@ -566,15 +577,32 @@ struct ArrayExporter { --n_buffers; ++buffers_begin; } + + bool need_variadic_buffer_sizes = + data->type->id() == Type::BINARY_VIEW || data->type->id() == Type::STRING_VIEW; + if (need_variadic_buffer_sizes) { + ++n_buffers; + } + export_.buffers_.resize(n_buffers); std::transform(buffers_begin, data->buffers.end(), export_.buffers_.begin(), [](const std::shared_ptr& buffer) -> const void* { return buffer ? 
buffer->data() : nullptr; }); + if (need_variadic_buffer_sizes) { + auto variadic_buffers = util::span(data->buffers).subspan(2); + export_.variadic_buffer_sizes_.resize(variadic_buffers.size()); + size_t i = 0; + for (const auto& buf : variadic_buffers) { + export_.variadic_buffer_sizes_[i++] = buf->size(); + } + export_.buffers_.back() = export_.variadic_buffer_sizes_.data(); + } + // Export dictionary if (data->dictionary != nullptr) { - dict_exporter_.reset(new ArrayExporter()); + dict_exporter_ = std::make_unique(); RETURN_NOT_OK(dict_exporter_->Export(data->dictionary)); } @@ -791,7 +819,7 @@ Status InvalidFormatString(std::string_view v) { class FormatStringParser { public: - FormatStringParser() {} + FormatStringParser() = default; explicit FormatStringParser(std::string_view v) : view_(v), index_(0) {} @@ -937,8 +965,6 @@ Result DecodeMetadata(const char* metadata) { } struct SchemaImporter { - SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {} - Status Import(struct ArrowSchema* src) { if (ArrowSchemaIsReleased(src)) { return Status::Invalid("Cannot import released ArrowSchema"); @@ -1064,6 +1090,8 @@ struct SchemaImporter { return ProcessPrimitive(binary()); case 'Z': return ProcessPrimitive(large_binary()); + case 'v': + return ProcessBinaryView(); case 'w': return ProcessFixedSizeBinary(); case 'd': @@ -1076,6 +1104,17 @@ struct SchemaImporter { return f_parser_.Invalid(); } + Status ProcessBinaryView() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'z': + return ProcessPrimitive(binary_view()); + case 'u': + return ProcessPrimitive(utf8_view()); + } + return f_parser_.Invalid(); + } + Status ProcessTemporal() { RETURN_NOT_OK(f_parser_.CheckHasNext()); switch (f_parser_.Next()) { @@ -1100,6 +1139,16 @@ struct SchemaImporter { return ProcessListLike(); case 'L': return ProcessListLike(); + case 'v': { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'l': + return ProcessListView(); + case 'L': + return ProcessListView(); + } + break; + } case 'w': return ProcessFixedSizeList(); case 's': @@ -1204,6 +1253,15 @@ struct SchemaImporter { return Status::OK(); } + template + Status ProcessListView() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + type_ = std::make_shared(std::move(field)); + return Status::OK(); + } + Status ProcessMap() { RETURN_NOT_OK(f_parser_.CheckAtEnd()); RETURN_NOT_OK(CheckNumChildren(1)); @@ -1337,8 +1395,8 @@ struct SchemaImporter { return Status::OK(); } - struct ArrowSchema* c_struct_; - SchemaExportGuard guard_; + struct ArrowSchema* c_struct_{nullptr}; + SchemaExportGuard guard_{nullptr}; FormatStringParser f_parser_; int64_t recursion_level_; std::vector child_importers_; @@ -1406,7 +1464,7 @@ class ImportedBuffer : public Buffer { std::shared_ptr import) : Buffer(data, size, mm, nullptr, device_type), import_(std::move(import)) {} - ~ImportedBuffer() override {} + ~ImportedBuffer() override = default; std::shared_ptr device_sync_event() override { return import_->device_sync_; @@ -1418,9 +1476,7 @@ class ImportedBuffer : public Buffer { struct ArrayImporter { explicit ArrayImporter(const std::shared_ptr& type) - : type_(type), - zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)), - device_type_(DeviceAllocationType::kCPU) {} + : type_(type), zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)) {} Status Import(struct ArrowDeviceArray* src, const DeviceMemoryMapper& mapper) { 
ARROW_ASSIGN_OR_RAISE(memory_mgr_, mapper(src->device_type, src->device_id)); @@ -1568,10 +1624,18 @@ struct ArrayImporter { Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); } + Status Visit(const StringViewType& type) { return ImportBinaryView(type); } + + Status Visit(const BinaryViewType& type) { return ImportBinaryView(type); } + Status Visit(const ListType& type) { return ImportListLike(type); } Status Visit(const LargeListType& type) { return ImportListLike(type); } + Status Visit(const ListViewType& type) { return ImportListView(type); } + + Status Visit(const LargeListViewType& type) { return ImportListView(type); } + Status Visit(const FixedSizeListType& type) { RETURN_NOT_OK(CheckNumChildren(1)); RETURN_NOT_OK(CheckNumBuffers(1)); @@ -1646,6 +1710,28 @@ struct ArrayImporter { return Status::OK(); } + Status ImportBinaryView(const BinaryViewType&) { + RETURN_NOT_OK(CheckNoChildren()); + if (c_struct_->n_buffers < 3) { + return Status::Invalid("Expected at least 3 buffers for imported type ", + type_->ToString(), ", ArrowArray struct has ", + c_struct_->n_buffers); + } + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportFixedSizeBuffer(1, BinaryViewType::kSize)); + + // The last C data buffer stores buffer sizes, and shouldn't be imported + auto* buffer_sizes = + static_cast(c_struct_->buffers[c_struct_->n_buffers - 1]); + + for (int32_t buffer_id = 2; buffer_id < c_struct_->n_buffers - 1; ++buffer_id) { + RETURN_NOT_OK(ImportBuffer(buffer_id, buffer_sizes[buffer_id - 2])); + } + data_->buffers.pop_back(); + return Status::OK(); + } + template Status ImportStringLike(const StringType& type) { RETURN_NOT_OK(CheckNoChildren()); @@ -1667,6 +1753,18 @@ struct ArrayImporter { return Status::OK(); } + template + Status ImportListView(const ListViewType& type) { + using offset_type = typename ListViewType::offset_type; + RETURN_NOT_OK(CheckNumChildren(1)); + RETURN_NOT_OK(CheckNumBuffers(3)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK((ImportOffsetsBuffer(1))); + RETURN_NOT_OK(ImportSizesBuffer(2)); + return Status::OK(); + } + Status CheckNoChildren() { return CheckNumChildren(0); } Status CheckNumChildren(int64_t n_children) { @@ -1735,11 +1833,18 @@ struct ArrayImporter { return ImportBuffer(buffer_id, buffer_size); } - template + template Status ImportOffsetsBuffer(int32_t buffer_id) { // Compute visible size of buffer - int64_t buffer_size = - sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1); + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + + (with_extra_offset ? 
1 : 0)); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportSizesBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset); return ImportBuffer(buffer_id, buffer_size); } @@ -1790,7 +1895,7 @@ struct ArrayImporter { std::shared_ptr zero_size_buffer_; std::shared_ptr memory_mgr_; - DeviceAllocationType device_type_; + DeviceAllocationType device_type_{DeviceAllocationType::kCPU}; }; } // namespace @@ -1996,7 +2101,7 @@ class ArrayStreamBatchReader : public RecordBatchReader { DCHECK(!ArrowArrayStreamIsReleased(&stream_)); } - ~ArrayStreamBatchReader() { + ~ArrayStreamBatchReader() override { if (!ArrowArrayStreamIsReleased(&stream_)) { ArrowArrayStreamRelease(&stream_); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bd0e498a9f332..326c67f5eceac 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -33,15 +33,18 @@ #include "arrow/c/util_internal.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" +#include "arrow/testing/builder.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/util.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/range.h" // TODO(GH-37221): Remove these ifdef checks when compute dependency is removed #ifdef ARROW_COMPUTE @@ -57,6 +60,7 @@ using internal::ArrayStreamExportTraits; using internal::checked_cast; using internal::SchemaExportGuard; using internal::SchemaExportTraits; +using internal::Zip; template struct ExportTraits {}; @@ -90,7 +94,7 @@ class ReleaseCallback { public: using CType = typename Traits::CType; - explicit ReleaseCallback(CType* c_struct) : called_(false) { + explicit ReleaseCallback(CType* c_struct) { orig_release_ = c_struct->release; orig_private_data_ = c_struct->private_data; c_struct->release = StaticRelease; @@ -122,7 +126,7 @@ class ReleaseCallback { private: ARROW_DISALLOW_COPY_AND_ASSIGN(ReleaseCallback); - bool called_; + bool called_{false}; void (*orig_release_)(CType*); void* orig_private_data_; }; @@ -237,8 +241,7 @@ struct SchemaExportChecker { flattened_flags.empty() ? 
std::vector(flattened_formats_.size(), kDefaultFlags) : std::move(flattened_flags)), - flattened_metadata_(std::move(flattened_metadata)), - flattened_index_(0) {} + flattened_metadata_(std::move(flattened_metadata)) {} void operator()(struct ArrowSchema* c_export, bool inner = false) { ASSERT_LT(flattened_index_, flattened_formats_.size()); @@ -287,7 +290,7 @@ struct SchemaExportChecker { const std::vector flattened_names_; std::vector flattened_flags_; const std::vector flattened_metadata_; - size_t flattened_index_; + size_t flattened_index_{0}; }; class TestSchemaExport : public ::testing::Test { @@ -353,6 +356,8 @@ TEST_F(TestSchemaExport, Primitive) { TestPrimitive(large_binary(), "Z"); TestPrimitive(utf8(), "u"); TestPrimitive(large_utf8(), "U"); + TestPrimitive(binary_view(), "vz"); + TestPrimitive(utf8_view(), "vu"); TestPrimitive(decimal(16, 4), "d:16,4"); TestPrimitive(decimal256(16, 4), "d:16,4,256"); @@ -397,6 +402,14 @@ TEST_F(TestSchemaExport, List) { TestNested(list(large_list(int32())), {"+l", "+L", "i"}, {"", "item", "item"}); } +TEST_F(TestSchemaExport, ListView) { + TestNested(list_view(int8()), {"+vl", "c"}, {"", "item"}); + TestNested(large_list_view(uint16()), {"+vL", "S"}, {"", "item"}); + + TestNested(list_view(large_list_view(int32())), {"+vl", "+vL", "i"}, + {"", "item", "item"}); +} + TEST_F(TestSchemaExport, Struct) { auto type = struct_({field("a", int8()), field("b", utf8())}); TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, @@ -556,12 +569,24 @@ struct ArrayExportChecker { --expected_n_buffers; ++expected_buffers; } - ASSERT_EQ(c_export->n_buffers, expected_n_buffers); + bool has_variadic_buffer_sizes = expected_data.type->id() == Type::STRING_VIEW || + expected_data.type->id() == Type::BINARY_VIEW; + ASSERT_EQ(c_export->n_buffers, expected_n_buffers + has_variadic_buffer_sizes); ASSERT_NE(c_export->buffers, nullptr); - for (int64_t i = 0; i < c_export->n_buffers; ++i) { + + for (int64_t i = 0; i < expected_n_buffers; ++i) { auto expected_ptr = expected_buffers[i] ? 
expected_buffers[i]->data() : nullptr; ASSERT_EQ(c_export->buffers[i], expected_ptr); } + if (has_variadic_buffer_sizes) { + auto variadic_buffers = util::span(expected_data.buffers).subspan(2); + auto variadic_buffer_sizes = util::span( + static_cast(c_export->buffers[c_export->n_buffers - 1]), + variadic_buffers.size()); + for (auto [buf, size] : Zip(variadic_buffers, variadic_buffer_sizes)) { + ASSERT_EQ(buf->size(), size); + } + } if (expected_data.dictionary != nullptr) { // Recurse into dictionary @@ -874,6 +899,8 @@ TEST_F(TestArrayExport, Primitive) { TestPrimitive(large_binary(), R"(["foo", "bar", null])"); TestPrimitive(utf8(), R"(["foo", "bar", null])"); TestPrimitive(large_utf8(), R"(["foo", "bar", null])"); + TestPrimitive(binary_view(), R"(["foo", "bar", null])"); + TestPrimitive(utf8_view(), R"(["foo", "bar", null])"); TestPrimitive(decimal(16, 4), R"(["1234.5670", null])"); TestPrimitive(decimal256(16, 4), R"(["1234.5670", null])"); @@ -887,6 +914,39 @@ TEST_F(TestArrayExport, PrimitiveSliced) { TestPrimitive(factory); } +constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux", + binary_view_buffer_content1 = "BinaryViewMultipleBuffers"; + +static const BinaryViewType::c_type binary_view_buffer1[] = { + util::ToBinaryView(binary_view_buffer_content0, 0, 0), + util::ToInlineBinaryView("foo"), + util::ToBinaryView(binary_view_buffer_content1, 1, 0), + util::ToInlineBinaryView("bar"), + util::ToBinaryView(binary_view_buffer_content0.substr(5), 0, 5), + util::ToInlineBinaryView("baz"), + util::ToBinaryView(binary_view_buffer_content1.substr(6, 13), 1, 6), + util::ToInlineBinaryView("quux"), +}; + +static auto MakeBinaryViewArrayWithMultipleDataBuffers() { + static const auto kLength = static_cast(std::size(binary_view_buffer1)); + return std::make_shared( + binary_view(), kLength, + Buffer::FromVector(std::vector(binary_view_buffer1, binary_view_buffer1 + kLength)), + BufferVector{ + Buffer::FromString(std::string{binary_view_buffer_content0}), + Buffer::FromString(std::string{binary_view_buffer_content1}), + }); +} + +TEST_F(TestArrayExport, BinaryViewMultipleBuffers) { + TestPrimitive(MakeBinaryViewArrayWithMultipleDataBuffers); + TestPrimitive([&] { + auto arr = MakeBinaryViewArrayWithMultipleDataBuffers(); + return arr->Slice(1, arr->length() - 2); + }); +} + TEST_F(TestArrayExport, Null) { TestPrimitive(null(), "[null, null, null]"); TestPrimitive(null(), "[]"); @@ -945,6 +1005,33 @@ TEST_F(TestArrayExport, ListSliced) { } } +TEST_F(TestArrayExport, ListView) { + TestNested(list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(large_list_view(uint16()), "[[1, 2], [3, null], null]"); + TestNested(fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestArrayExport, ListViewSliced) { + { + auto factory = []() { + return ArrayFromJSON(list_view(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = []() { + auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->Slice(1, 6); + auto offsets = ArrayFromJSON(int32(), "[5, 2, 0, 3]")->Slice(1, 2); + auto sizes = ArrayFromJSON(int32(), "[2, 3, 6, 1]")->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestArrayExport, Struct) { const char* data = R"([[1, "foo"], [2, null]])"; auto type = struct_({field("a", int8()), field("b", utf8())}); @@ -1184,13 
+1271,16 @@ TEST_F(TestArrayExport, ExportRecordBatch) { static const char kMyDeviceTypeName[] = "arrowtest::MyDevice"; static const ArrowDeviceType kMyDeviceType = ARROW_DEVICE_EXT_DEV; -static const void* kMyEventPtr = reinterpret_cast(uintptr_t(0xBAADF00D)); +static const void* kMyEventPtr = + reinterpret_cast(static_cast(0xBAADF00D)); class MyBuffer final : public MutableBuffer { public: using MutableBuffer::MutableBuffer; - ~MyBuffer() { default_memory_pool()->Free(const_cast(data_), size_); } + ~MyBuffer() override { + default_memory_pool()->Free(const_cast(data_), size_); + } std::shared_ptr device_sync_event() override { return device_sync_; } @@ -1220,7 +1310,7 @@ class MyDevice : public Device { explicit MySyncEvent(void* sync_event, release_fn_t release_sync_event) : Device::SyncEvent(sync_event, release_sync_event) {} - virtual ~MySyncEvent() = default; + ~MySyncEvent() override = default; Status Wait() override { return Status::OK(); } Status Record(const Device::Stream&) override { return Status::OK(); } }; @@ -1490,6 +1580,45 @@ TEST_F(TestDeviceArrayExport, ListSliced) { } } +TEST_F(TestDeviceArrayExport, ListView) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestNested(mm, list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(mm, large_list_view(uint16()), "[[1, 2], [3, null], null]"); + + TestNested(mm, list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestDeviceArrayExport, ListViewSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + { + auto factory = [=]() { + return (*ToDevice(mm, *ArrayFromJSON(list_view(int8()), + "[[1, 2], [3, null], [4, 5, 6], null]") + ->data())) + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = [=]() { + auto values = + (*ToDevice(mm, + *ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->data())) + ->Slice(1, 6); + auto offsets = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[5, 2, 0, 3]")->data()))->Slice(1, 2); + auto sizes = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[2, 3, 6, 1]")->data()))->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestDeviceArrayExport, Struct) { std::shared_ptr device = std::make_shared(1); auto mm = device->default_memory_manager(); @@ -1891,6 +2020,10 @@ TEST_F(TestSchemaImport, String) { CheckImport(large_utf8()); FillPrimitive("Z"); CheckImport(large_binary()); + FillPrimitive("vu"); + CheckImport(utf8_view()); + FillPrimitive("vz"); + CheckImport(binary_view()); FillPrimitive("w:3"); CheckImport(fixed_size_binary(3)); @@ -1930,6 +2063,33 @@ TEST_F(TestSchemaImport, NestedList) { CheckImport(list(fixed_size_list(int8(), 3))); } +TEST_F(TestSchemaImport, ListView) { + FillPrimitive(AddChild(), "c"); + FillListLike("+vl"); + CheckImport(list_view(int8())); + + FillPrimitive(AddChild(), "s", "item", 0); + FillListLike("+vl"); + CheckImport(list_view(field("item", int16(), /*nullable=*/false))); + + // Large list-view + FillPrimitive(AddChild(), "s"); + FillListLike("+vL"); + CheckImport(large_list_view(int16())); +} + +TEST_F(TestSchemaImport, NestedListView) { + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+vl"); + FillListLike("+vL"); + CheckImport(large_list_view(list_view(int8()))); + + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+w:3"); + FillListLike("+vl"); + CheckImport(list_view(fixed_size_list(int8(), 3))); +} + 
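To make the new format strings concrete ("vu"/"vz" for the view binaries, "+vl"/"+vL" for list-views), a type can be round-tripped through the public C bridge helpers. This is a sketch assuming a build that includes the view types added by this patch; ExportType/ImportType are the existing helpers from arrow/c/bridge.h:

// Sketch: round-tripping the new view types through the C data interface.
#include "arrow/api.h"
#include "arrow/c/abi.h"
#include "arrow/c/bridge.h"

arrow::Status RoundTripViewTypes() {
  for (const auto& type : {arrow::utf8_view(), arrow::list_view(arrow::int32())}) {
    struct ArrowSchema c_schema;
    ARROW_RETURN_NOT_OK(arrow::ExportType(*type, &c_schema));  // emits "vu" / "+vl"
    ARROW_ASSIGN_OR_RAISE(auto imported, arrow::ImportType(&c_schema));
    if (!imported->Equals(*type)) {
      return arrow::Status::Invalid("round trip changed ", type->ToString());
    }
  }
  return arrow::Status::OK();
}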
TEST_F(TestSchemaImport, Struct) { FillPrimitive(AddChild(), "u", "strs"); FillPrimitive(AddChild(), "S", "ints"); @@ -2317,6 +2477,16 @@ static const void* large_string_buffers_no_nulls1[3] = { static const void* large_string_buffers_omitted[3] = { nullptr, large_string_offsets_buffer1, nullptr}; +constexpr int64_t binary_view_buffer_sizes1[] = {binary_view_buffer_content0.size(), + binary_view_buffer_content1.size()}; +static const void* binary_view_buffers_no_nulls1[] = { + nullptr, + binary_view_buffer1, + binary_view_buffer_content0.data(), + binary_view_buffer_content1.data(), + binary_view_buffer_sizes1, +}; + static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1}; static const void* list_buffers_nulls1[2] = {bits_buffer1, list_offsets_buffer1}; @@ -2325,6 +2495,18 @@ static const int64_t large_list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* large_list_buffers_no_nulls1[2] = {nullptr, large_list_offsets_buffer1}; +static const int32_t list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int32_t list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* list_view_buffers_no_nulls1[3] = {nullptr, list_view_offsets_buffer1, + list_view_sizes_buffer1}; +static const void* list_view_buffers_nulls1[3] = {bits_buffer1, list_view_offsets_buffer1, + list_view_sizes_buffer1}; + +static const int64_t large_list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int64_t large_list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* large_list_view_buffers_no_nulls1[3] = { + nullptr, large_list_view_offsets_buffer1, large_list_view_sizes_buffer1}; + static const int8_t type_codes_buffer1[] = {42, 42, 43, 43, 42}; static const int32_t union_offsets_buffer1[] = {0, 1, 0, 1, 2}; static const void* sparse_union_buffers1_legacy[2] = {nullptr, type_codes_buffer1}; @@ -2396,6 +2578,16 @@ class TestArrayImport : public ::testing::Test { c->buffers = buffers; } + void FillStringViewLike(struct ArrowArray* c, int64_t length, int64_t null_count, + int64_t offset, const void** buffers, + int32_t data_buffer_count) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 2 + data_buffer_count + 1; + c->buffers = buffers; + } + void FillListLike(struct ArrowArray* c, int64_t length, int64_t null_count, int64_t offset, const void** buffers) { c->length = length; @@ -2407,6 +2599,17 @@ class TestArrayImport : public ::testing::Test { c->children = NLastChildren(1, c); } + void FillListView(struct ArrowArray* c, int64_t length, int64_t null_count, + int64_t offset, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + } + void FillFixedSizeListLike(struct ArrowArray* c, int64_t length, int64_t null_count, int64_t offset, const void** buffers) { c->length = length; @@ -2458,11 +2661,22 @@ class TestArrayImport : public ::testing::Test { FillStringLike(&c_struct_, length, null_count, offset, buffers); } + void FillStringViewLike(int64_t length, int64_t null_count, int64_t offset, + const void** buffers, int32_t data_buffer_count) { + FillStringViewLike(&c_struct_, length, null_count, offset, buffers, + data_buffer_count); + } + void FillListLike(int64_t length, int64_t null_count, int64_t offset, const void** buffers) { FillListLike(&c_struct_, length, null_count, offset, buffers); } + void 
FillListView(int64_t length, int64_t null_count, int64_t offset, + const void** buffers) { + FillListView(&c_struct_, length, null_count, offset, buffers); + } + void FillFixedSizeListLike(int64_t length, int64_t null_count, int64_t offset, const void** buffers) { FillFixedSizeListLike(&c_struct_, length, null_count, offset, buffers); @@ -2704,6 +2918,10 @@ TEST_F(TestArrayImport, String) { FillStringLike(4, 0, 0, large_string_buffers_no_nulls1); CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])")); + auto length = static_cast(std::size(binary_view_buffer1)); + FillStringViewLike(length, 0, 0, binary_view_buffers_no_nulls1, 2); + CheckImport(MakeBinaryViewArrayWithMultipleDataBuffers()); + // Empty array with null data pointers FillStringLike(0, 0, 0, string_buffers_omitted); CheckImport(ArrayFromJSON(utf8(), "[]")); @@ -2820,6 +3038,53 @@ TEST_F(TestArrayImport, ListWithOffset) { "[[6, 7, 8], [9, 10, 11], [12, 13, 14]]")); } +TEST_F(TestArrayImport, ListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[1, 2], [], [3, 4, 5], [6], [7, 8]]")); + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 1, 0, list_view_buffers_nulls1); + CheckImport( + ArrayFromJSON(list_view(int16()), "[[513, 1027], null, [1541, 2055, 2569]]")); + + // Large list-view + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(large_list_view(int16()), "[[513, 1027], [], [1541, 2055, 2569]]")); +} + +TEST_F(TestArrayImport, NestedListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(AddChild(), 5, 0, 0, list_view_buffers_no_nulls1); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_list_view(list_view(int8())), + "[[[1, 2], []], [], [[3, 4, 5], [6], [7, 8]]]")); + + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls1_8); + FillFixedSizeListLike(AddChild(), 2, 0, 0, buffers_no_nulls_no_data); + FillListView(2, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(fixed_size_list(int8(), 3)), + "[[[1, 2, 3], [4, 5, 6]], []]")); +} + +TEST_F(TestArrayImport, ListViewWithOffset) { + // Offset in child + FillPrimitive(AddChild(), 8, 0, 1, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[2, 3], [], [4, 5, 6], [7], [8, 9]]")); + + // Offset in parent + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [3, 4, 5], [6], [7, 8]]")); + + // Both + FillPrimitive(AddChild(), 8, 0, 2, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [5, 6, 7], [8], [9, 10]]")); +} + TEST_F(TestArrayImport, Struct) { FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1_16); @@ -3117,6 +3382,17 @@ TEST_F(TestArrayImport, ListError) { CheckImportError(list(int8())); } +TEST_F(TestArrayImport, ListViewNoError) { + // Unlike with lists, importing a length-0 list-view with all buffers ommitted is + // not an error. 
List-views don't need an extra offset value, so an empty offsets + // buffer is valid in this case. + + // Null offsets pointer + FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(list_view(int8()), "[]")); +} + TEST_F(TestArrayImport, MapError) { // Bad number of (struct) children in map child FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); @@ -3342,15 +3618,16 @@ TEST_F(TestSchemaRoundtrip, Primitive) { TestWithTypeFactory(boolean); TestWithTypeFactory(float16); - TestWithTypeFactory(std::bind(decimal128, 19, 4)); - TestWithTypeFactory(std::bind(decimal256, 19, 4)); - TestWithTypeFactory(std::bind(decimal128, 19, 0)); - TestWithTypeFactory(std::bind(decimal256, 19, 0)); - TestWithTypeFactory(std::bind(decimal128, 19, -5)); - TestWithTypeFactory(std::bind(decimal256, 19, -5)); - TestWithTypeFactory(std::bind(fixed_size_binary, 3)); + TestWithTypeFactory([] { return decimal128(19, 4); }); + TestWithTypeFactory([] { return decimal256(19, 4); }); + TestWithTypeFactory([] { return decimal128(19, 0); }); + TestWithTypeFactory([] { return decimal256(19, 0); }); + TestWithTypeFactory([] { return decimal128(19, -5); }); + TestWithTypeFactory([] { return decimal256(19, -5); }); + TestWithTypeFactory([] { return fixed_size_binary(3); }); TestWithTypeFactory(binary); TestWithTypeFactory(large_utf8); + TestWithTypeFactory(binary_view); } TEST_F(TestSchemaRoundtrip, Temporal) { @@ -3358,8 +3635,8 @@ TEST_F(TestSchemaRoundtrip, Temporal) { TestWithTypeFactory(day_time_interval); TestWithTypeFactory(month_interval); TestWithTypeFactory(month_day_nano_interval); - TestWithTypeFactory(std::bind(time64, TimeUnit::NANO)); - TestWithTypeFactory(std::bind(duration, TimeUnit::MICRO)); + TestWithTypeFactory([] { return time64(TimeUnit::NANO); }); + TestWithTypeFactory([] { return duration(TimeUnit::MICRO); }); TestWithTypeFactory([]() { return arrow::timestamp(TimeUnit::MICRO, "Europe/Paris"); }); } @@ -3370,6 +3647,12 @@ TEST_F(TestSchemaRoundtrip, List) { TestWithTypeFactory([]() { return list(fixed_size_list(utf8(), 5)); }); } +TEST_F(TestSchemaRoundtrip, ListView) { + TestWithTypeFactory([]() { return list_view(utf8()); }); + TestWithTypeFactory([]() { return large_list_view(list_view(utf8())); }); + TestWithTypeFactory([]() { return list_view(fixed_size_list(utf8(), 5)); }); +} + TEST_F(TestSchemaRoundtrip, Struct) { auto f1 = field("f1", utf8(), /*nullable=*/false); auto f2 = field("f2", list(decimal(19, 4))); @@ -3609,6 +3892,14 @@ TEST_F(TestArrayRoundtrip, Primitive) { R"([[4, 5, 6], [1, -600, 5000], null, null])"); } +TEST_F(TestArrayRoundtrip, BinaryViewMultipleBuffers) { + TestWithArrayFactory(MakeBinaryViewArrayWithMultipleDataBuffers); + TestWithArrayFactory([&] { + auto arr = MakeBinaryViewArrayWithMultipleDataBuffers(); + return arr->Slice(1, arr->length() - 2); + }); +} + TEST_F(TestArrayRoundtrip, UnknownNullCount) { TestWithArrayFactory([]() -> Result> { auto arr = ArrayFromJSON(int32(), "[0, 1, 2]"); @@ -3631,6 +3922,31 @@ TEST_F(TestArrayRoundtrip, List) { TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); } +TEST_F(TestArrayRoundtrip, ListView) { + TestWithJSON(list_view(int32()), "[]"); + TestWithJSON(list_view(int32()), "[[4, 5], [6, null], null]"); + + TestWithJSONSliced(list_view(int32()), "[[4, 5], [6, null], null]"); + + // Out-of-order offsets + TestWithArrayFactory([this]() -> Result> { + std::shared_ptr offsets; + ArrayFromVector(int32(), + 
std::vector{false, true, true, true, false, true}, + std::vector{4, 2, 1, 3, 3, 2}, &offsets); + + std::shared_ptr sizes; + ArrayFromVector(std::vector{2, 2, 3, 1, 2, 0}, &sizes); + + auto values = ArrayFromJSON(int8(), "[4, 5, 6, null, 8, null]"); + auto result = ListViewArray::FromArrays(*offsets, *sizes, *values, pool_); + if (result.ok()) { + RETURN_NOT_OK((*result)->ValidateFull()); + } + return result; + }); +} + TEST_F(TestArrayRoundtrip, Struct) { auto type = struct_({field("ints", int16()), field("bools", boolean())}); TestWithJSON(type, "[]"); @@ -3986,8 +4302,6 @@ TEST_F(TestDeviceArrayRoundtrip, Primitive) { TestWithJSON(mm, int32(), "[4, 5, null]"); } -// TODO C -> C++ -> C roundtripping tests? - //////////////////////////////////////////////////////////////////////////// // Array stream export tests diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 50cfdd05a14bb..bb632e2eb912d 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -308,6 +308,10 @@ class RangeDataEqualsImpl { Status Visit(const LargeListType& type) { return CompareList(type); } + Status Visit(const ListViewType& type) { return CompareListView(type); } + + Status Visit(const LargeListViewType& type) { return CompareListView(type); } + Status Visit(const FixedSizeListType& type) { const auto list_size = type.list_size(); const ArrayData& left_data = *left_.child_data[0]; @@ -493,6 +497,38 @@ class RangeDataEqualsImpl { return Status::OK(); } + template + Status CompareListView(const TypeClass& type) { + const ArrayData& left_values = *left_.child_data[0]; + const ArrayData& right_values = *right_.child_data[0]; + + using offset_type = typename TypeClass::offset_type; + const auto* left_offsets = left_.GetValues(1) + left_start_idx_; + const auto* right_offsets = right_.GetValues(1) + right_start_idx_; + const auto* left_sizes = left_.GetValues(2) + left_start_idx_; + const auto* right_sizes = right_.GetValues(2) + right_start_idx_; + + auto compare_view = [&](int64_t i, int64_t length) -> bool { + for (int64_t j = i; j < i + length; ++j) { + if (left_sizes[j] != right_sizes[j]) { + return false; + } + const offset_type size = left_sizes[j]; + if (size == 0) { + continue; + } + RangeDataEqualsImpl impl(options_, floating_approximate_, left_values, + right_values, left_offsets[j], right_offsets[j], size); + if (!impl.Compare()) { + return false; + } + } + return true; + }; + VisitValidRuns(std::move(compare_view)); + return Status::OK(); + } + template Status CompareRunEndEncoded() { auto left_span = ArraySpan(left_); @@ -699,7 +735,8 @@ class TypeEqualsVisitor { } template - enable_if_t::value, Status> Visit(const T& left) { + enable_if_t::value || is_list_view_type::value, Status> Visit( + const T& left) { std::shared_ptr left_field = left.field(0); std::shared_ptr right_field = checked_cast(right_).field(0); bool equal_names = !check_metadata_ || (left_field->name() == right_field->name()); @@ -857,6 +894,18 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const ListViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); + } + + Status Visit(const LargeListViewScalar& left) { + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); + } + Status Visit(const MapScalar& left) { const auto& right = checked_cast(right_); result_ = 
ArrayEquals(*left.value, *right.value, options_, floating_approximate_); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 5b5b5718e19dc..1adb3e96c97c8 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -38,6 +38,12 @@ #include "arrow/util/macros.h" #include "arrow/util/visibility.h" +// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h. +// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum. +#if defined(__APPLE__) && defined(PREALLOCATE) +#undef PREALLOCATE +#endif + namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 6b4b2339e4afe..ee181c053c053 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -82,9 +82,9 @@ std::optional GetConstantValidityWord(const ExecValue& data) { return {}; } -// if the condition is null then output is null otherwise we take validity from the -// selected argument -// ie. cond.valid & (cond.data & left.valid | ~cond.data & right.valid) +/// If the condition is null then output is null otherwise we take validity from the +/// selected argument +/// (i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)). struct IfElseNullPromoter { KernelContext* ctx; const ArraySpan& cond; @@ -368,7 +368,7 @@ void RunIfElseLoopInverted(const ArraySpan& cond, const HandleBlock& handle_bloc } /// Runs if-else when cond is a scalar. Two special functions are required, -/// 1.CopyArrayData, 2. BroadcastScalar +/// 1. CopyArrayData, 2. BroadcastScalar template Status RunIfElseScalar(const BooleanScalar& cond, const ExecValue& left, const ExecValue& right, ExecResult* out, @@ -1028,7 +1028,7 @@ struct NestedIfElseExec { // AAA static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1041,7 +1041,7 @@ struct NestedIfElseExec { // ASA static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1054,7 +1054,7 @@ struct NestedIfElseExec { // AAS static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1067,7 +1067,7 @@ struct NestedIfElseExec { // ASS static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1078,8 +1078,9 @@ struct NestedIfElseExec { } template - static Status RunLoop(KernelContext* ctx, const ArraySpan& cond, ExecResult* out, - HandleLeft&& handle_left, HandleRight&& handle_right) { + static Status RunLoopOfNestedIfElseExec(KernelContext* ctx, const ArraySpan& cond, + ExecResult* out, HandleLeft&& 
handle_left, + HandleRight&& handle_right) { std::unique_ptr raw_builder; RETURN_NOT_OK(MakeBuilderExactIndex(ctx->memory_pool(), out->type()->GetSharedPtr(), &raw_builder)); @@ -1308,9 +1309,9 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : - {Type::LIST, Type::LARGE_LIST, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, + Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 34225ce9fe084..b72402bbccd4e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -67,11 +67,15 @@ struct GetBytesProcessedVisitor { } template - enable_if_var_size_list Visit(const ArrowType& type) { + enable_if_var_length_list_like Visit(const ArrowType& type) { using ArrayType = typename TypeTraits::ArrayType; using OffsetType = typename TypeTraits::OffsetType::c_type; - total_bytes += (arr->length() + 1) * sizeof(OffsetType); + const auto num_offsets = is_list_view(type) ? arr->length() : arr->length() + 1; + total_bytes += num_offsets * sizeof(OffsetType); + // NOTE: the sizes buffer is not counted when type is a list-view as that + // can make the throughput numbers look better just because the sizes + // increase the number of bytes in the input. 
auto child_array = internal::checked_cast(arr)->values(); return RecurseInto(child_array.get()); } @@ -126,7 +130,7 @@ static void IfElseBench(benchmark::State& state) { } template -static void IfElseBenchList(benchmark::State& state) { +static void IfElseBenchVarLengthListLike(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBench(state, list_type); @@ -172,7 +176,7 @@ static void IfElseBenchContiguous(benchmark::State& state) { } template -static void IfElseBenchListContiguous(benchmark::State& state) { +static void IfElseBenchVarLengthListLikeContiguous(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBenchContiguous(state, list_type); @@ -187,11 +191,11 @@ static void IfElseBench32(benchmark::State& state) { } static void IfElseBenchListUInt32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchListString32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchString32(benchmark::State& state) { @@ -211,11 +215,27 @@ static void IfElseBench32Contiguous(benchmark::State& state) { } static void IfElseBenchListUInt32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchListString32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewUInt32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewString32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewUInt32Contiguous(benchmark::State& state) { + return IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewString32Contiguous(benchmark::State& state) { + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchString64Contiguous(benchmark::State& state) { @@ -494,6 +514,12 @@ BENCHMARK(IfElseBenchListString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListUInt32Contiguous)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListString32Contiguous)->Args({kNumItems, 0}); +// IfElse: ListViews +BENCHMARK(IfElseBenchListViewUInt32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewUInt32Contiguous)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32Contiguous)->Args({kNumItems, 0}); + // IfElse: Strings BENCHMARK(IfElseBenchString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchString64)->Args({kNumItems, 0}); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index a9c5a1fc3c96f..a11aab81742ed 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -737,12 +737,15 @@ TEST_F(TestIfElseKernel, Decimal) { } } +using ListAndListViewArrowTypes = + ::testing::Types; + template -class TestIfElseList : public ::testing::Test {}; +class TestIfElseVarLengthListLike : public ::testing::Test {}; -TYPED_TEST_SUITE(TestIfElseList, ListArrowTypes); +TYPED_TEST_SUITE(TestIfElseVarLengthListLike, ListAndListViewArrowTypes); 
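Since the kernel registration above now covers LIST_VIEW and LARGE_LIST_VIEW, the if_else function can be exercised directly on list-view arrays. A minimal sketch, using the ArrayFromJSON helper from the testing library and leaving error handling to the returned Result:

// Sketch: calling the if_else kernel with list-view inputs.
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only helper)

arrow::Result<arrow::Datum> IfElseOnListViews() {
  auto cond = arrow::ArrayFromJSON(arrow::boolean(), "[true, false, null]");
  auto left = arrow::ArrayFromJSON(arrow::list_view(arrow::int32()),
                                   "[[1, 2], [3], null]");
  auto right = arrow::ArrayFromJSON(arrow::list_view(arrow::int32()),
                                    "[[9], [], [7, 8]]");
  return arrow::compute::CallFunction("if_else", {cond, left, right});
}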
-TYPED_TEST(TestIfElseList, ListOfInt) { +TYPED_TEST(TestIfElseVarLengthListLike, ListOfInt) { auto type = std::make_shared(int32()); CheckWithDifferentShapes(ArrayFromJSON(boolean(), "[true, true, false, false]"), ArrayFromJSON(type, "[[], null, [1, null], [2, 3]]"), @@ -755,7 +758,7 @@ TYPED_TEST(TestIfElseList, ListOfInt) { ArrayFromJSON(type, "[null, null, null, null]")); } -TYPED_TEST(TestIfElseList, ListOfString) { +TYPED_TEST(TestIfElseVarLengthListLike, ListOfString) { auto type = std::make_shared(utf8()); CheckWithDifferentShapes( ArrayFromJSON(boolean(), "[true, true, false, false]"), diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 39936fbd7b5b2..1cdd92d5c42f2 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -398,7 +398,7 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { /// /// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`, /// which means the results from the underlying fragment generator will be - /// transfered to the default CPU thread pool. The generator itself is + /// transferred to the default CPU thread pool. The generator itself is /// offloaded to run on the default IO thread pool. virtual Result GetFragmentsAsyncImpl( compute::Expression predicate, arrow::internal::Executor* executor); diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc index a2096d691b4bf..ae9fb36484bb6 100644 --- a/cpp/src/arrow/dataset/dataset_writer.cc +++ b/cpp/src/arrow/dataset/dataset_writer.cc @@ -87,7 +87,7 @@ class Throttle { private: Future<> backpressure_ = Future<>::MakeFinished(); - uint64_t max_value_; + const uint64_t max_value_; uint64_t in_waiting_ = 0; uint64_t current_value_ = 0; std::mutex mutex_; @@ -621,11 +621,21 @@ class DatasetWriter::DatasetWriterImpl { backpressure = writer_state_.open_files_throttle.Acquire(1); if (!backpressure.is_finished()) { EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyOpenFiles"); + writer_state_.rows_in_flight_throttle.Release(next_chunk->num_rows()); RETURN_NOT_OK(TryCloseLargestFile()); break; } } - RETURN_NOT_OK(dir_queue->StartWrite(next_chunk)); + auto s = dir_queue->StartWrite(next_chunk); + if (!s.ok()) { + // If `StartWrite` succeeded, it will Release the + // `rows_in_flight_throttle` when the write task is finished. + // + // `open_files_throttle` will be handed by `DatasetWriterDirectoryQueue` + // so we don't need to release it here. 
+ writer_state_.rows_in_flight_throttle.Release(next_chunk->num_rows()); + return s; + } batch = std::move(remainder); if (batch) { RETURN_NOT_OK(dir_queue->FinishCurrentFile()); @@ -647,6 +657,7 @@ class DatasetWriter::DatasetWriterImpl { DatasetWriterState writer_state_; std::function pause_callback_; std::function resume_callback_; + // Map from directory + prefix to the queue for that directory std::unordered_map> directory_queues_; std::mutex mutex_; diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index c76e79d79b449..e62e779f71797 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -189,7 +189,8 @@ class DatasetWriterTestFixture : public testing::Test { } } - void AssertCreatedData(const std::vector& expected_files) { + void AssertCreatedData(const std::vector& expected_files, + bool check_num_record_batches = true) { counter_ = 0; for (const auto& expected_file : expected_files) { std::optional written_file = FindFile(expected_file.filename); @@ -197,7 +198,9 @@ class DatasetWriterTestFixture : public testing::Test { int num_batches = 0; AssertBatchesEqual(*MakeBatch(expected_file.start, expected_file.num_rows), *ReadAsBatch(written_file->data, &num_batches)); - ASSERT_EQ(expected_file.num_record_batches, num_batches); + if (check_num_record_batches) { + ASSERT_EQ(expected_file.num_record_batches, num_batches); + } } } @@ -277,6 +280,30 @@ TEST_F(DatasetWriterTestFixture, MaxRowsOneWrite) { {"testdir/chunk-3.arrow", 30, 5}}); } +TEST_F(DatasetWriterTestFixture, MaxRowsOneWriteBackpresure) { + // GH-38884: This test is to make sure that the writer can handle + // throttle resources in `WriteRecordBatch`. + + constexpr auto kFileSizeLimit = static_cast(10); + write_options_.max_rows_per_file = kFileSizeLimit; + write_options_.max_rows_per_group = kFileSizeLimit; + write_options_.max_open_files = 2; + write_options_.min_rows_per_group = kFileSizeLimit - 1; + auto dataset_writer = MakeDatasetWriter(/*max_rows=*/kFileSizeLimit); + for (int i = 0; i < 20; ++i) { + dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 5), ""); + } + EndWriterChecked(dataset_writer.get()); + std::vector expected_files; + for (int i = 0; i < 100; ++i) { + expected_files.emplace_back("testdir/chunk-" + std::to_string(i) + ".arrow", + kFileSizeLimit * i, kFileSizeLimit); + } + // Not checking the number of record batches because file may contain the + // zero-length record batch. + AssertCreatedData(expected_files, /*check_num_record_batches=*/false); +} + TEST_F(DatasetWriterTestFixture, MaxRowsOneWriteWithFunctor) { // Left padding with up to four zeros write_options_.max_rows_per_group = 10; diff --git a/cpp/src/arrow/dataset/file_json.cc b/cpp/src/arrow/dataset/file_json.cc index 6ca8405f03e2c..1d545c3969f6a 100644 --- a/cpp/src/arrow/dataset/file_json.cc +++ b/cpp/src/arrow/dataset/file_json.cc @@ -324,8 +324,8 @@ Result MakeBatchGenerator( const std::shared_ptr& file) { ARROW_ASSIGN_OR_RAISE(auto future, DoOpenReader(file->source(), format, scan_options)); auto maybe_reader = future.result(); - // Defer errors that occured during reader instantiation since they're likely related to - // batch-processing. + // Defer errors that occurred during reader instantiation since they're likely related + // to batch-processing. 
if (!maybe_reader.ok()) { return MakeFailingGenerator>(maybe_reader.status()); } diff --git a/cpp/src/arrow/dataset/file_json_test.cc b/cpp/src/arrow/dataset/file_json_test.cc index 2b4fcdd82f7ff..9626e8a5509df 100644 --- a/cpp/src/arrow/dataset/file_json_test.cc +++ b/cpp/src/arrow/dataset/file_json_test.cc @@ -162,7 +162,7 @@ std::shared_ptr ToFileSource(std::string json) { return std::make_shared(Buffer::FromString(std::move(json))); } -// Mixin for additional JSON-specific tests, compatibile with both format APIs. +// Mixin for additional JSON-specific tests, compatible with both format APIs. template class JsonScanMixin { public: @@ -245,7 +245,7 @@ class JsonScanMixin { // Use a reduced number of rows in valgrind to avoid timeouts. #ifndef ARROW_VALGRIND -constexpr static int64_t kTestMaxNumRows = json::kMaxParserNumRows; +constexpr static int64_t kTestMaxNumRows = (1UL << 17); #else constexpr static int64_t kTestMaxNumRows = 1024; #endif diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 65ad70181f28a..3afe4ec85cf49 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -98,6 +98,10 @@ parquet::ReaderProperties MakeReaderProperties( parquet_scan_options->reader_properties->thrift_string_size_limit()); properties.set_thrift_container_size_limit( parquet_scan_options->reader_properties->thrift_container_size_limit()); + + properties.set_page_checksum_verification( + parquet_scan_options->reader_properties->page_checksum_verification()); + return properties; } diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 5132a805bb4d6..f527ce5d70ae0 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -335,7 +335,7 @@ class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory { /// \brief Create a ParquetDatasetFactory from a metadata source. /// /// Similar to the previous Make definition, but the metadata can be a Buffer - /// and the base_path is explicited instead of inferred from the metadata + /// and the base_path is explicit instead of inferred from the metadata /// path. 
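The page-checksum propagation added to MakeReaderProperties above can be opted into from scan options. A hedged sketch; it assumes the usual ParquetFragmentScanOptions/ScanOptions members and omits error handling:

// Sketch: asking dataset scans to verify Parquet page checksums. The fragment
// scan options are forwarded into the Parquet reader properties by the code above.
#include <memory>
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/scanner.h"

void EnablePageChecksums(
    const std::shared_ptr<arrow::dataset::ScanOptions>& scan_options) {
  auto parquet_options = std::make_shared<arrow::dataset::ParquetFragmentScanOptions>();
  parquet_options->reader_properties->set_page_checksum_verification(true);
  scan_options->fragment_scan_options = std::move(parquet_options);
}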
/// /// \param[in] metadata source to open the metadata parquet file from diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 1e932948676b2..7ec96929a9355 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -28,6 +28,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/dataset/dataset.h" #include "arrow/dataset/file_ipc.h" #include "arrow/dataset/test_util_internal.h" @@ -40,6 +41,8 @@ namespace arrow { +using compute::Cast; + using internal::checked_pointer_cast; namespace dataset { @@ -335,7 +338,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) { partitioning_ = std::make_shared( schema({field("year", int32()), field("month", int8()), field("day", temporal)})); - ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal)); + ASSERT_OK_AND_ASSIGN(auto day, Cast(StringScalar("2020-06-08"), temporal)); AssertParse("/2020/06/2020-06-08/", and_({equal(field_ref("year"), literal(2020)), equal(field_ref("month"), literal(6)), diff --git a/cpp/src/arrow/dataset/scan_node.cc b/cpp/src/arrow/dataset/scan_node.cc index 5ed6eee5ddf83..c25c5b70ae1ec 100644 --- a/cpp/src/arrow/dataset/scan_node.cc +++ b/cpp/src/arrow/dataset/scan_node.cc @@ -94,7 +94,7 @@ Future>> GetFragments( /// fragment on disk actually had a column x, and the value was not 7, then we will prefer /// the guarantee in this invalid case. /// -/// Ths next step is to fetch the metadata for the fragment. For some formats (e.g. +/// The next step is to fetch the metadata for the fragment. For some formats (e.g. /// CSV) this may be quite simple (get the size of the file). For other formats (e.g. /// parquet) this is more involved and requires reading data. There is one metadata /// io-task per fragment. The metadata io-task creates an AsyncGenerator @@ -150,7 +150,7 @@ class ScanNode : public acero::ExecNode, public acero::TracedNode { } if (normalized.filter.call() && normalized.filter.IsBound()) { - // There is no easy way to make sure a filter was bound agaisnt the same + // There is no easy way to make sure a filter was bound against the same // function registry as the one in ctx so we just require it to be unbound // FIXME - Do we care if it was bound to a different function registry? return Status::Invalid("Scan filter must be unbound"); diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 5479a0d9db404..4479158ff20cc 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -141,7 +141,7 @@ struct ARROW_DS_EXPORT ScanOptions { /// Scan-specific options, which can be changed between scans of the same dataset. /// /// A dataset consists of one or more individual fragments. A fragment is anything -/// that is indepedently scannable, often a file. +/// that is independently scannable, often a file. /// /// Batches from all fragments will be converted to a single schema. This unified /// schema is referred to as the "dataset schema" and is the output schema for @@ -230,7 +230,7 @@ struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions { /// for example, if scanning a parquet file that has batches with 100MiB of data /// then the actual readahead will be at least 100MiB /// - /// Set to 0 to disable readhead. When disabled, the scanner will read the + /// Set to 0 to disable readahead. 
When disabled, the scanner will read the /// dataset one batch at a time /// /// This limit applies across all fragments. If the limit is 32MiB and the diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index cde3a725c4663..fccfc80032d31 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -2591,7 +2591,7 @@ TEST(ScanNode, MinimalEndToEnd) { // for now, specify the projection as the full project expression (eventually this can // just be a list of materialized field names) compute::Expression a_times_2 = call("multiply", {field_ref("a"), literal(2)}); - // set the projection such that required project experssion field is included as a + // set the projection such that required project expression field is included as a // field_ref compute::Expression project_expr = field_ref("a"); options->projection = @@ -2686,7 +2686,7 @@ TEST(ScanNode, MinimalScalarAggEndToEnd) { // for now, specify the projection as the full project expression (eventually this can // just be a list of materialized field names) compute::Expression a_times_2 = call("multiply", {field_ref("a"), literal(2)}); - // set the projection such that required project experssion field is included as a + // set the projection such that required project expression field is included as a // field_ref compute::Expression project_expr = field_ref("a"); options->projection = @@ -2778,7 +2778,7 @@ TEST(ScanNode, MinimalGroupedAggEndToEnd) { // for now, specify the projection as the full project expression (eventually this can // just be a list of materialized field names) compute::Expression a_times_2 = call("multiply", {field_ref("a"), literal(2)}); - // set the projection such that required project experssion field is included as a + // set the projection such that required project expression field is included as a // field_ref compute::Expression a = field_ref("a"); compute::Expression b = field_ref("b"); @@ -2888,12 +2888,12 @@ TEST(ScanNode, OnlyLoadProjectedFields) { {acero::Declaration({"scan", dataset::ScanNodeOptions{dataset, scan_options}})}); ASSERT_OK_AND_ASSIGN(auto actual, acero::DeclarationToTable(declarations)); // Scan node always emits augmented fields so we drop those - ASSERT_OK_AND_ASSIGN(auto actualMinusAgumented, actual->SelectColumns({0, 1, 2})); + ASSERT_OK_AND_ASSIGN(auto actualMinusAugmented, actual->SelectColumns({0, 1, 2})); auto expected = TableFromJSON(dummy_schema, {R"([ [null, 1, null], [null, 4, null] ])"}); - AssertTablesEqual(*expected, *actualMinusAgumented, /*same_chunk_layout=*/false); + AssertTablesEqual(*expected, *actualMinusAugmented, /*same_chunk_layout=*/false); } } // namespace dataset diff --git a/cpp/src/arrow/dataset/subtree_test.cc b/cpp/src/arrow/dataset/subtree_test.cc index 75429a5fb7f95..fc13c20ecee49 100644 --- a/cpp/src/arrow/dataset/subtree_test.cc +++ b/cpp/src/arrow/dataset/subtree_test.cc @@ -133,7 +133,7 @@ void ExpectForestIs(std::vector infos, std::vector expected_roots) ASSERT_OK(forest.Visit( [&](Forest::Ref ref) -> Result { actual_roots.emplace_back(ref, infos); - return false; // only vist roots + return false; // only visit roots }, [](Forest::Ref) {})); diff --git a/cpp/src/arrow/dataset/test_util_internal.h b/cpp/src/arrow/dataset/test_util_internal.h index 51d39d532c82c..de0519afac9e1 100644 --- a/cpp/src/arrow/dataset/test_util_internal.h +++ b/cpp/src/arrow/dataset/test_util_internal.h @@ -1257,7 +1257,7 @@ class FileFormatScanNodeMixin : public FileFormatFixtureMixinV2, int64_t 
expected_batches() const { return GetParam().num_batches; } int64_t expected_rows() const { return GetParam().expected_rows(); } - // Override FileFormatFixtureMixin::GetRandomData to paramterize the # + // Override FileFormatFixtureMixin::GetRandomData to parameterize the # // of batches and rows per batch std::shared_ptr GetRandomData( std::shared_ptr schema) override { diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index d395261597696..5d892af9a394e 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -807,6 +807,14 @@ struct ScalarToProtoImpl { return Status::OK(); } + Status Visit(const ListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + + Status Visit(const LargeListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + Status Visit(const StructScalar& s) { lit_->set_allocated_struct_(new Lit::Struct()); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index d3fb058137e6a..f4a2e6800eb49 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -313,6 +313,10 @@ struct DataTypeToProtoImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { auto types = SetWithThen(&substrait::Type::set_allocated_struct_)->mutable_types(); diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fdf119477ab8b..daababb04c172 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -24,6 +24,7 @@ #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" +#include "arrow/io/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" #include "arrow/util/formatting.h" @@ -38,12 +39,13 @@ namespace fs { // ----------------------------------------------------------------------- // AzureOptions Implementation -AzureOptions::AzureOptions() {} +AzureOptions::AzureOptions() = default; bool AzureOptions::Equals(const AzureOptions& other) const { return (account_dfs_url == other.account_dfs_url && account_blob_url == other.account_blob_url && - credentials_kind == other.credentials_kind); + credentials_kind == other.credentials_kind && + default_metadata == other.default_metadata); } Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_name, @@ -64,87 +66,94 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n namespace { -// An AzureFileSystem represents a single Azure storage account. AzurePath describes a -// container and path within that storage account. -struct AzurePath { - std::string full_path; +// An AzureFileSystem represents a single Azure storage +// account. AzureLocation describes a container and path within +// that storage account. 
+struct AzureLocation { + std::string all; std::string container; - std::string path_to_file; - std::vector path_to_file_parts; + std::string path; + std::vector path_parts; - static Result FromString(const std::string& s) { + static Result FromString(const std::string& string) { // Example expected string format: testcontainer/testdir/testfile.txt // container = testcontainer - // path_to_file = testdir/testfile.txt - // path_to_file_parts = [testdir, testfile.txt] - if (internal::IsLikelyUri(s)) { + // path = testdir/testfile.txt + // path_parts = [testdir, testfile.txt] + if (internal::IsLikelyUri(string)) { return Status::Invalid( - "Expected an Azure object path of the form 'container/path...', got a URI: '", - s, "'"); + "Expected an Azure object location of the form 'container/path...', got a URI: " + "'", + string, "'"); } - auto first_sep = s.find_first_of(internal::kSep); + auto first_sep = string.find_first_of(internal::kSep); if (first_sep == 0) { - return Status::Invalid("Path cannot start with a separator ('", s, "')"); + return Status::Invalid("Location cannot start with a separator ('", string, "')"); } if (first_sep == std::string::npos) { - return AzurePath{std::string(s), std::string(s), "", {}}; + return AzureLocation{string, string, "", {}}; } - AzurePath path; - path.full_path = std::string(s); - path.container = std::string(s.substr(0, first_sep)); - path.path_to_file = std::string(s.substr(first_sep + 1)); - path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file); - RETURN_NOT_OK(Validate(path)); - return path; + AzureLocation location; + location.all = string; + location.container = string.substr(0, first_sep); + location.path = string.substr(first_sep + 1); + location.path_parts = internal::SplitAbstractPath(location.path); + RETURN_NOT_OK(location.Validate()); + return location; } - static Status Validate(const AzurePath& path) { - auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts); - if (!status.ok()) { - return Status::Invalid(status.message(), " in path ", path.full_path); - } else { - return status; - } - } - - AzurePath parent() const { + AzureLocation parent() const { DCHECK(has_parent()); - auto parent = AzurePath{"", container, "", path_to_file_parts}; - parent.path_to_file_parts.pop_back(); - parent.path_to_file = internal::JoinAbstractPath(parent.path_to_file_parts); - if (parent.path_to_file.empty()) { - parent.full_path = parent.container; + AzureLocation parent{"", container, "", path_parts}; + parent.path_parts.pop_back(); + parent.path = internal::JoinAbstractPath(parent.path_parts); + if (parent.path.empty()) { + parent.all = parent.container; } else { - parent.full_path = parent.container + internal::kSep + parent.path_to_file; + parent.all = parent.container + internal::kSep + parent.path; } return parent; } - bool has_parent() const { return !path_to_file.empty(); } + Result join(const std::string& stem) const { + return FromString(internal::ConcatAbstractPath(all, stem)); + } + + bool has_parent() const { return !path.empty(); } - bool empty() const { return container.empty() && path_to_file.empty(); } + bool empty() const { return container.empty() && path.empty(); } - bool operator==(const AzurePath& other) const { - return container == other.container && path_to_file == other.path_to_file; + bool operator==(const AzureLocation& other) const { + return container == other.container && path == other.path; + } + + private: + Status Validate() { + auto status = 
internal::ValidateAbstractPathParts(path_parts); + if (!status.ok()) { + return Status::Invalid(status.message(), " in location ", all); + } else { + return status; + } } }; -Status PathNotFound(const AzurePath& path) { - return ::arrow::fs::internal::PathNotFound(path.full_path); +Status PathNotFound(const AzureLocation& location) { + return ::arrow::fs::internal::PathNotFound(location.all); } -Status NotAFile(const AzurePath& path) { - return ::arrow::fs::internal::NotAFile(path.full_path); +Status NotAFile(const AzureLocation& location) { + return ::arrow::fs::internal::NotAFile(location.all); } -Status ValidateFilePath(const AzurePath& path) { - if (path.container.empty()) { - return PathNotFound(path); +Status ValidateFileLocation(const AzureLocation& location) { + if (location.container.empty()) { + return PathNotFound(location); } - - if (path.path_to_file.empty()) { - return NotAFile(path); + if (location.path.empty()) { + return NotAFile(location); } + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(location.path)); return Status::OK(); } @@ -308,10 +317,11 @@ std::shared_ptr PropertiesToMetadata( class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile(std::shared_ptr blob_client, - const io::IOContext& io_context, AzurePath path, int64_t size = kNoSize) + const io::IOContext& io_context, AzureLocation location, + int64_t size = kNoSize) : blob_client_(std::move(blob_client)), io_context_(io_context), - path_(std::move(path)), + location_(std::move(location)), content_length_(size) {} Status Init() { @@ -326,11 +336,11 @@ class ObjectInputFile final : public io::RandomAccessFile { return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - return PathNotFound(path_); + return PathNotFound(location_); } return internal::ExceptionToStatus( "GetProperties failed for '" + blob_client_->GetUrl() + - "' with an unexpected Azure error. Can not initialise an ObjectInputFile " + "' with an unexpected Azure error. Cannot initialise an ObjectInputFile " "without knowing the file size.", exception); } @@ -451,13 +461,232 @@ class ObjectInputFile final : public io::RandomAccessFile { private: std::shared_ptr blob_client_; const io::IOContext io_context_; - AzurePath path_; + AzureLocation location_; bool closed_ = false; int64_t pos_ = 0; int64_t content_length_ = kNoSize; std::shared_ptr metadata_; }; + +Status CreateEmptyBlockBlob( + std::shared_ptr block_blob_client) { + try { + block_blob_client->UploadFrom(nullptr, 0); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "UploadFrom failed for '" + block_blob_client->GetUrl() + + "' with an unexpected Azure error. There is no existing blob at this " + "location or the existing blob must be replaced so ObjectAppendStream must " + "create a new empty block blob.", + exception); + } + return Status::OK(); +} + +Result GetBlockList( + std::shared_ptr block_blob_client) { + try { + return block_blob_client->GetBlockList().Value; + } catch (Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "GetBlockList failed for '" + block_blob_client->GetUrl() + + "' with an unexpected Azure error. 
Cannot write to a file without first " + "fetching the existing block list.", + exception); + } +} + +Azure::Storage::Metadata ArrowMetadataToAzureMetadata( + const std::shared_ptr& arrow_metadata) { + Azure::Storage::Metadata azure_metadata; + for (auto key_value : arrow_metadata->sorted_pairs()) { + azure_metadata[key_value.first] = key_value.second; + } + return azure_metadata; +} + +Status CommitBlockList( + std::shared_ptr block_blob_client, + const std::vector& block_ids, const Azure::Storage::Metadata& metadata) { + Azure::Storage::Blobs::CommitBlockListOptions options; + options.Metadata = metadata; + try { + // CommitBlockList puts all block_ids in the latest element. That means in the case of + // overlapping block_ids the newly staged block ids will always replace the + // previously committed blocks. + // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body + block_blob_client->CommitBlockList(block_ids, options); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "CommitBlockList failed for '" + block_blob_client->GetUrl() + + "' with an unexpected Azure error. Committing is required to flush an " + "output/append stream.", + exception); + } + return Status::OK(); +} + +class ObjectAppendStream final : public io::OutputStream { + public: + ObjectAppendStream( + std::shared_ptr block_blob_client, + const io::IOContext& io_context, const AzureLocation& location, + const std::shared_ptr& metadata, + const AzureOptions& options, int64_t size = kNoSize) + : block_blob_client_(std::move(block_blob_client)), + io_context_(io_context), + location_(location), + content_length_(size) { + if (metadata && metadata->size() != 0) { + metadata_ = ArrowMetadataToAzureMetadata(metadata); + } else if (options.default_metadata && options.default_metadata->size() != 0) { + metadata_ = ArrowMetadataToAzureMetadata(options.default_metadata); + } + } + + ~ObjectAppendStream() override { + // For compliance with the rest of the IO stack, Close rather than Abort, + // even though it may be more expensive. + io::internal::CloseFromDestructor(this); + } + + Status Init() { + if (content_length_ != kNoSize) { + DCHECK_GE(content_length_, 0); + pos_ = content_length_; + } else { + try { + auto properties = block_blob_client_->GetProperties(); + content_length_ = properties.Value.BlobSize; + pos_ = content_length_; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client_)); + } else { + return internal::ExceptionToStatus( + "GetProperties failed for '" + block_blob_client_->GetUrl() + + "' with an unexpected Azure error. 
Cannot initialise an " + "ObjectAppendStream without knowing whether a file already exists at " + "this path, and if it exists, its size.", + exception); + } + content_length_ = 0; + } + } + if (content_length_ > 0) { + ARROW_ASSIGN_OR_RAISE(auto block_list, GetBlockList(block_blob_client_)); + for (auto block : block_list.CommittedBlocks) { + block_ids_.push_back(block.Name); + } + } + return Status::OK(); + } + + Status Abort() override { + if (closed_) { + return Status::OK(); + } + block_blob_client_ = nullptr; + closed_ = true; + return Status::OK(); + } + + Status Close() override { + if (closed_) { + return Status::OK(); + } + RETURN_NOT_OK(Flush()); + block_blob_client_ = nullptr; + closed_ = true; + return Status::OK(); + } + + bool closed() const override { return closed_; } + + Status CheckClosed(const char* action) const { + if (closed_) { + return Status::Invalid("Cannot ", action, " on closed stream."); + } + return Status::OK(); + } + + Result Tell() const override { + RETURN_NOT_OK(CheckClosed("tell")); + return pos_; + } + + Status Write(const std::shared_ptr& buffer) override { + return DoAppend(buffer->data(), buffer->size(), buffer); + } + + Status Write(const void* data, int64_t nbytes) override { + return DoAppend(data, nbytes); + } + + Status Flush() override { + RETURN_NOT_OK(CheckClosed("flush")); + return CommitBlockList(block_blob_client_, block_ids_, metadata_); + } + + private: + Status DoAppend(const void* data, int64_t nbytes, + std::shared_ptr owned_buffer = nullptr) { + RETURN_NOT_OK(CheckClosed("append")); + auto append_data = reinterpret_cast(data); + Azure::Core::IO::MemoryBodyStream block_content(append_data, nbytes); + if (block_content.Length() == 0) { + return Status::OK(); + } + + const auto n_block_ids = block_ids_.size(); + + // New block ID must always be distinct from the existing block IDs. Otherwise we + // will accidentally replace the content of existing blocks, causing corruption. + // We will use monotonically increasing integers. + auto new_block_id = std::to_string(n_block_ids); + + // Pad to 5 digits, because Azure allows a maximum of 50,000 blocks. + const size_t target_number_of_digits = 5; + const auto required_padding_digits = + target_number_of_digits - std::min(target_number_of_digits, new_block_id.size()); + new_block_id.insert(0, required_padding_digits, '0'); + // There is a small risk when appending to a blob created by another client that + // `new_block_id` may overlap with an existing block id. Adding the `-arrow` + // suffix significantly reduces the risk, but does not 100% eliminate it. For example, + // if the blob was previously created with one block with id `00001-arrow`, then the + // next block we append will conflict with that and cause corruption. + new_block_id += "-arrow"; + new_block_id = Azure::Core::Convert::Base64Encode( + std::vector(new_block_id.begin(), new_block_id.end())); + + try { + block_blob_client_->StageBlock(new_block_id, block_content); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "StageBlock failed for '" + block_blob_client_->GetUrl() + "' new_block_id: '" + + new_block_id + + "' with an unexpected Azure error. 
Staging new blocks is fundamental to " + "streaming writes to blob storage.", + exception); + } + block_ids_.push_back(new_block_id); + pos_ += nbytes; + content_length_ += nbytes; + return Status::OK(); + } + + std::shared_ptr block_blob_client_; + const io::IOContext io_context_; + const AzureLocation location_; + + bool closed_ = false; + int64_t pos_ = 0; + int64_t content_length_ = kNoSize; + std::vector block_ids_; + Azure::Storage::Metadata metadata_; +}; + } // namespace // ----------------------------------------------------------------------- @@ -488,22 +717,24 @@ class AzureFileSystem::Impl { const AzureOptions& options() const { return options_; } public: - Result GetFileInfo(const AzurePath& path) { + Result GetFileInfo(const AzureLocation& location) { FileInfo info; - info.set_path(path.full_path); - - if (path.container.empty()) { - DCHECK(path.path_to_file.empty()); // The path is invalid if the container is empty - // but not path_to_file. - // path must refer to the root of the Azure storage account. This is a directory, - // and there isn't any extra metadata to fetch. + info.set_path(location.all); + + if (location.container.empty()) { + // The location is invalid if the container is empty but not + // path. + DCHECK(location.path.empty()); + // The location must refer to the root of the Azure storage + // account. This is a directory, and there isn't any extra + // metadata to fetch. info.set_type(FileType::Directory); return info; } - if (path.path_to_file.empty()) { - // path refers to a container. This is a directory if it exists. + if (location.path.empty()) { + // The location refers to a container. This is a directory if it exists. auto container_client = - blob_service_client_->GetBlobContainerClient(path.container); + blob_service_client_->GetBlobContainerClient(location.container); try { auto properties = container_client.GetProperties(); info.set_type(FileType::Directory); @@ -522,13 +753,13 @@ class AzureFileSystem::Impl { exception); } } - auto file_client = datalake_service_client_->GetFileSystemClient(path.container) - .GetFileClient(path.path_to_file); + auto file_client = datalake_service_client_->GetFileSystemClient(location.container) + .GetFileClient(location.path); try { auto properties = file_client.GetProperties(); if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); - } else if (internal::HasTrailingSlash(path.path_to_file)) { + } else if (internal::HasTrailingSlash(location.path)) { // For a path with a trailing slash a hierarchical namespace may return a blob // with that trailing slash removed. For consistency with flat namespace and // other filesystems we chose to return NotFound. @@ -544,7 +775,7 @@ class AzureFileSystem::Impl { } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(path.container)); + hierarchical_namespace_.Enabled(location.container)); if (hierarchical_namespace_enabled) { // If the hierarchical namespace is enabled, then the storage account will have // explicit directories. Neither a file nor a directory was found. @@ -557,7 +788,7 @@ class AzureFileSystem::Impl { // If listing the prefix `path.path_to_file` with trailing slash returns at least // one result then `path` refers to an implied directory. 
- auto prefix = internal::EnsureTrailingSlash(path.path_to_file); + auto prefix = internal::EnsureTrailingSlash(location.path); list_blob_options.Prefix = prefix; // We only need to know if there is at least one result, so minimise page size // for efficiency. @@ -565,7 +796,7 @@ class AzureFileSystem::Impl { try { auto paged_list_result = - blob_service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(location.container) .ListBlobs(list_blob_options); if (paged_list_result.Blobs.size() > 0) { info.set_type(FileType::Directory); @@ -589,50 +820,250 @@ class AzureFileSystem::Impl { } } - Result> OpenInputFile(const std::string& s, + private: + template + Status VisitContainers(const Azure::Core::Context& context, + OnContainer&& on_container) const { + Azure::Storage::Blobs::ListBlobContainersOptions options; + try { + auto container_list_response = + blob_service_client_->ListBlobContainers(options, context); + for (; container_list_response.HasPage(); + container_list_response.MoveToNextPage(context)) { + for (const auto& container : container_list_response.BlobContainers) { + RETURN_NOT_OK(on_container(container)); + } + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus("Failed to list account containers.", exception); + } + return Status::OK(); + } + + static FileInfo FileInfoFromBlob(const std::string& container, + const Azure::Storage::Blobs::Models::BlobItem& blob) { + auto path = internal::ConcatAbstractPath(container, blob.Name); + if (internal::HasTrailingSlash(blob.Name)) { + return DirectoryFileInfoFromPath(path); + } + FileInfo info{std::move(path), FileType::File}; + info.set_size(blob.BlobSize); + info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); + return info; + } + + static FileInfo DirectoryFileInfoFromPath(const std::string& path) { + return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, + FileType::Directory}; + } + + static std::string_view BasenameView(std::string_view s) { + DCHECK(!internal::HasTrailingSlash(s)); + auto offset = s.find_last_of(internal::kSep); + auto result = (offset == std::string_view::npos) ? s : s.substr(offset); + DCHECK(!result.empty() && result.back() != internal::kSep); + return result; + } + + /// \brief List the blobs at the root of a container or some dir in a container. + /// + /// \pre container_client is the client for the container named like the first + /// segment of select.base_dir. + Status GetFileInfoWithSelectorFromContainer( + const Azure::Storage::Blobs::BlobContainerClient& container_client, + const Azure::Core::Context& context, Azure::Nullable page_size_hint, + const FileSelector& select, FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + bool found = false; + Azure::Storage::Blobs::ListBlobsOptions options; + if (internal::IsEmptyPath(base_location.path)) { + // If the base_dir is the root of the container, then we want to list all blobs in + // the container and the Prefix should be empty and not even include the trailing + // slash because the container itself represents the `/` directory. + options.Prefix = {}; + found = true; // Unless the container itself is not found later! 
+ } else { + options.Prefix = internal::EnsureTrailingSlash(base_location.path); + } + options.PageSizeHint = page_size_hint; + options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; + + auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = internal::ConcatAbstractPath( + base_location.container, internal::RemoveTrailingSlash(blob_prefix)); + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + return GetFileInfoWithSelectorFromContainer( + container_client, context, page_size_hint, sub_select, acc_results); + } + return Status::OK(); + }; + + auto process_blob = + [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { + // blob.Name has trailing slash only when Prefix is an empty + // directory marker blob for the directory we're listing + // from, and we should skip it. + if (!internal::HasTrailingSlash(blob.Name)) { + acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); + } + }; + auto process_prefix = [&](const std::string& prefix) noexcept -> Status { + const auto path = internal::ConcatAbstractPath(base_location.container, prefix); + acc_results->push_back(DirectoryFileInfoFromPath(path)); + return recurse(prefix); + }; + + try { + auto list_response = + container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); + for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { + if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) { + continue; + } + found = true; + // Blobs and BlobPrefixes are sorted by name, so we can merge-iterate + // them to ensure returned results are all sorted. + size_t blob_index = 0; + size_t blob_prefix_index = 0; + while (blob_index < list_response.Blobs.size() && + blob_prefix_index < list_response.BlobPrefixes.size()) { + const auto& blob = list_response.Blobs[blob_index]; + const auto& prefix = list_response.BlobPrefixes[blob_prefix_index]; + const int cmp = blob.Name.compare(prefix); + if (cmp < 0) { + process_blob(blob); + blob_index += 1; + } else if (cmp > 0) { + RETURN_NOT_OK(process_prefix(prefix)); + blob_prefix_index += 1; + } else { + DCHECK_EQ(blob.Name, prefix); + RETURN_NOT_OK(process_prefix(prefix)); + blob_index += 1; + blob_prefix_index += 1; + // If the container has an empty dir marker blob and another blob starting + // with this blob name as a prefix, the blob doesn't appear in the listing + // that also contains the prefix, so AFAICT this branch is unreachable. The + // code above is kept just in case, but if this DCHECK(false) is ever reached, + // we should refactor this loop to ensure no duplicate entries are ever + // reported. 
+ DCHECK(false) + << "Unexpected blob/prefix name collision on the same listing request"; + } + } + for (; blob_index < list_response.Blobs.size(); blob_index++) { + process_blob(list_response.Blobs[blob_index]); + } + for (; blob_prefix_index < list_response.BlobPrefixes.size(); + blob_prefix_index++) { + RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); + } + } + } catch (const Azure::Storage::StorageException& exception) { + if (exception.ErrorCode == "ContainerNotFound") { + found = false; + } else { + return internal::ExceptionToStatus( + "Failed to list blobs in a directory: " + select.base_dir + ": " + + container_client.GetUrl(), + exception); + } + } + + return found || select.allow_not_found + ? Status::OK() + : ::arrow::fs::internal::PathNotFound(select.base_dir); + } + + public: + Status GetFileInfoWithSelector(const Azure::Core::Context& context, + Azure::Nullable page_size_hint, + const FileSelector& select, + FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + if (base_location.container.empty()) { + // Without a container, the base_location is equivalent to the filesystem + // root -- `/`. FileSelector::allow_not_found doesn't matter in this case + // because the root always exists. + auto on_container = + [&](const Azure::Storage::Blobs::Models::BlobContainerItem& container) { + // Deleted containers are not listed by ListContainers. + DCHECK(!container.IsDeleted); + + // Every container is considered a directory. + FileInfo info{container.Name, FileType::Directory}; + info.set_mtime( + std::chrono::system_clock::time_point{container.Details.LastModified}); + acc_results->push_back(std::move(info)); + + // Recurse into containers (subdirectories) if requested. 
+ if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = container.Name; + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context, page_size_hint, + sub_select, acc_results)); + } + return Status::OK(); + }; + return VisitContainers(context, std::move(on_container)); + } + + auto container_client = + blob_service_client_->GetBlobContainerClient(base_location.container); + return GetFileInfoWithSelectorFromContainer(container_client, context, page_size_hint, + select, acc_results); + } + + Result> OpenInputFile(const AzureLocation& location, AzureFileSystem* fs) { - ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); - ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); - RETURN_NOT_OK(ValidateFilePath(path)); + RETURN_NOT_OK(ValidateFileLocation(location)); auto blob_client = std::make_shared( - blob_service_client_->GetBlobContainerClient(path.container) - .GetBlobClient(path.path_to_file)); + blob_service_client_->GetBlobContainerClient(location.container) + .GetBlobClient(location.path)); - auto ptr = - std::make_shared(blob_client, fs->io_context(), std::move(path)); + auto ptr = std::make_shared(blob_client, fs->io_context(), + std::move(location)); RETURN_NOT_OK(ptr->Init()); return ptr; } Result> OpenInputFile(const FileInfo& info, AzureFileSystem* fs) { - ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path())); if (info.type() == FileType::NotFound) { return ::arrow::fs::internal::PathNotFound(info.path()); } if (info.type() != FileType::File && info.type() != FileType::Unknown) { return ::arrow::fs::internal::NotAFile(info.path()); } - ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); - RETURN_NOT_OK(ValidateFilePath(path)); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(info.path())); + RETURN_NOT_OK(ValidateFileLocation(location)); auto blob_client = std::make_shared( - blob_service_client_->GetBlobContainerClient(path.container) - .GetBlobClient(path.path_to_file)); + blob_service_client_->GetBlobContainerClient(location.container) + .GetBlobClient(location.path)); auto ptr = std::make_shared(blob_client, fs->io_context(), - std::move(path), info.size()); + std::move(location), info.size()); RETURN_NOT_OK(ptr->Init()); return ptr; } - Status CreateDir(const AzurePath& path) { - if (path.container.empty()) { + Status CreateDir(const AzureLocation& location) { + if (location.container.empty()) { return Status::Invalid("Cannot create an empty container"); } - if (path.path_to_file.empty()) { + if (location.path.empty()) { auto container_client = - blob_service_client_->GetBlobContainerClient(path.container); + blob_service_client_->GetBlobContainerClient(location.container); try { auto response = container_client.Create(); if (response.Value.Created) { @@ -640,18 +1071,18 @@ class AzureFileSystem::Impl { } else { return StatusFromErrorResponse( container_client.GetUrl(), response.RawResponse.get(), - "Failed to create a container: " + path.container); + "Failed to create a container: " + location.container); } } catch (const Azure::Storage::StorageException& exception) { return internal::ExceptionToStatus( - "Failed to create a container: " + path.container + ": " + + "Failed to create a container: " + location.container + ": " + container_client.GetUrl(), exception); } } ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - 
hierarchical_namespace_.Enabled(path.container)); + hierarchical_namespace_.Enabled(location.container)); if (!hierarchical_namespace_enabled) { // Without hierarchical namespace enabled Azure blob storage has no directories. // Therefore we can't, and don't need to create one. Simply creating a blob with `/` @@ -659,62 +1090,292 @@ class AzureFileSystem::Impl { return Status::OK(); } - auto directory_client = datalake_service_client_->GetFileSystemClient(path.container) - .GetDirectoryClient(path.path_to_file); + auto directory_client = + datalake_service_client_->GetFileSystemClient(location.container) + .GetDirectoryClient(location.path); try { auto response = directory_client.Create(); if (response.Value.Created) { return Status::OK(); } else { - return StatusFromErrorResponse( - directory_client.GetUrl(), response.RawResponse.get(), - "Failed to create a directory: " + path.path_to_file); + return StatusFromErrorResponse(directory_client.GetUrl(), + response.RawResponse.get(), + "Failed to create a directory: " + location.path); } } catch (const Azure::Storage::StorageException& exception) { return internal::ExceptionToStatus( - "Failed to create a directory: " + path.path_to_file + ": " + + "Failed to create a directory: " + location.path + ": " + directory_client.GetUrl(), exception); } } - Status CreateDirRecursive(const AzurePath& path) { - if (path.container.empty()) { + Status CreateDirRecursive(const AzureLocation& location) { + if (location.container.empty()) { return Status::Invalid("Cannot create an empty container"); } - auto container_client = blob_service_client_->GetBlobContainerClient(path.container); + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); try { container_client.CreateIfNotExists(); } catch (const Azure::Storage::StorageException& exception) { return internal::ExceptionToStatus( - "Failed to create a container: " + path.container + " (" + + "Failed to create a container: " + location.container + " (" + container_client.GetUrl() + ")", exception); } ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, - hierarchical_namespace_.Enabled(path.container)); + hierarchical_namespace_.Enabled(location.container)); if (!hierarchical_namespace_enabled) { - // We can't create a directory without hierarchical namespace - // support. There is only "virtual directory" without - // hierarchical namespace support. And a "virtual directory" is - // (virtually) created a blob with ".../.../blob" blob name - // automatically. + // Without hierarchical namespace enabled Azure blob storage has no directories. + // Therefore we can't, and don't need to create one. Simply creating a blob with `/` + // in the name implies directories. 
return Status::OK(); } - auto directory_client = datalake_service_client_->GetFileSystemClient(path.container) - .GetDirectoryClient(path.path_to_file); + if (!location.path.empty()) { + auto directory_client = + datalake_service_client_->GetFileSystemClient(location.container) + .GetDirectoryClient(location.path); + try { + directory_client.CreateIfNotExists(); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to create a directory: " + location.path + " (" + + directory_client.GetUrl() + ")", + exception); + } + } + + return Status::OK(); + } + + Result> OpenAppendStream( + const AzureLocation& location, + const std::shared_ptr& metadata, const bool truncate, + AzureFileSystem* fs) { + RETURN_NOT_OK(ValidateFileLocation(location)); + + auto block_blob_client = std::make_shared( + blob_service_client_->GetBlobContainerClient(location.container) + .GetBlockBlobClient(location.path)); + + std::shared_ptr stream; + if (truncate) { + RETURN_NOT_OK(CreateEmptyBlockBlob(block_blob_client)); + stream = std::make_shared(block_blob_client, fs->io_context(), + location, metadata, options_, 0); + } else { + stream = std::make_shared(block_blob_client, fs->io_context(), + location, metadata, options_); + } + RETURN_NOT_OK(stream->Init()); + return stream; + } + + private: + Status DeleteDirContentsWithoutHierarchicalNamespace(const AzureLocation& location, + bool missing_dir_ok) { + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + Azure::Storage::Blobs::ListBlobsOptions options; + if (!location.path.empty()) { + options.Prefix = internal::EnsureTrailingSlash(location.path); + } + // https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch#remarks + // + // Only supports up to 256 subrequests in a single batch. The + // size of the body for a batch request can't exceed 4 MB. 
+ const int32_t kNumMaxRequestsInBatch = 256; + options.PageSizeHint = kNumMaxRequestsInBatch; try { - directory_client.CreateIfNotExists(); + auto list_response = container_client.ListBlobs(options); + if (!missing_dir_ok && list_response.Blobs.empty()) { + return PathNotFound(location); + } + for (; list_response.HasPage(); list_response.MoveToNextPage()) { + if (list_response.Blobs.empty()) { + continue; + } + auto batch = container_client.CreateBatch(); + std::vector> + deferred_responses; + for (const auto& blob_item : list_response.Blobs) { + deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); + } + try { + container_client.SubmitBatch(batch); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete blobs in a directory: " + location.path + ": " + + container_client.GetUrl(), + exception); + } + std::vector failed_blob_names; + for (size_t i = 0; i < deferred_responses.size(); ++i) { + const auto& deferred_response = deferred_responses[i]; + bool success = true; + try { + auto delete_result = deferred_response.GetResponse(); + success = delete_result.Value.Deleted; + } catch (const Azure::Storage::StorageException& exception) { + success = false; + } + if (!success) { + const auto& blob_item = list_response.Blobs[i]; + failed_blob_names.push_back(blob_item.Name); + } + } + if (!failed_blob_names.empty()) { + if (failed_blob_names.size() == 1) { + return Status::IOError("Failed to delete a blob: ", failed_blob_names[0], + ": " + container_client.GetUrl()); + } else { + return Status::IOError("Failed to delete blobs: [", + arrow::internal::JoinStrings(failed_blob_names, ", "), + "]: " + container_client.GetUrl()); + } + } + } } catch (const Azure::Storage::StorageException& exception) { return internal::ExceptionToStatus( - "Failed to create a directory: " + path.path_to_file + " (" + - directory_client.GetUrl() + ")", + "Failed to list blobs in a directory: " + location.path + ": " + + container_client.GetUrl(), exception); } + return Status::OK(); + } + + public: + Status DeleteDir(const AzureLocation& location) { + if (location.container.empty()) { + return Status::Invalid("Cannot delete an empty container"); + } + + if (location.path.empty()) { + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + try { + auto response = container_client.Delete(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + container_client.GetUrl(), response.RawResponse.get(), + "Failed to delete a container: " + location.container); + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a container: " + location.container + ": " + + container_client.GetUrl(), + exception); + } + } + + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(location.container)); + if (hierarchical_namespace_enabled) { + auto directory_client = + datalake_service_client_->GetFileSystemClient(location.container) + .GetDirectoryClient(location.path); + try { + auto response = directory_client.DeleteRecursive(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + directory_client.GetUrl(), response.RawResponse.get(), + "Failed to delete a directory: " + location.path); + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a directory: 
" + location.path + ": " + + directory_client.GetUrl(), + exception); + } + } else { + return DeleteDirContentsWithoutHierarchicalNamespace(location, + /*missing_dir_ok=*/true); + } + } + + Status DeleteDirContents(const AzureLocation& location, bool missing_dir_ok) { + if (location.container.empty()) { + return internal::InvalidDeleteDirContents(location.all); + } + + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(location.container)); + if (hierarchical_namespace_enabled) { + auto file_system_client = + datalake_service_client_->GetFileSystemClient(location.container); + auto directory_client = file_system_client.GetDirectoryClient(location.path); + try { + auto list_response = directory_client.ListPaths(false); + for (; list_response.HasPage(); list_response.MoveToNextPage()) { + for (const auto& path : list_response.Paths) { + if (path.IsDirectory) { + auto sub_directory_client = + file_system_client.GetDirectoryClient(path.Name); + try { + sub_directory_client.DeleteRecursive(); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a sub directory: " + location.container + + internal::kSep + path.Name + ": " + sub_directory_client.GetUrl(), + exception); + } + } else { + auto sub_file_client = file_system_client.GetFileClient(path.Name); + try { + sub_file_client.Delete(); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a sub file: " + location.container + + internal::kSep + path.Name + ": " + sub_file_client.GetUrl(), + exception); + } + } + } + } + } catch (const Azure::Storage::StorageException& exception) { + if (missing_dir_ok && + exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + return Status::OK(); + } else { + return internal::ExceptionToStatus( + "Failed to delete directory contents: " + location.path + ": " + + directory_client.GetUrl(), + exception); + } + } + return Status::OK(); + } else { + return DeleteDirContentsWithoutHierarchicalNamespace(location, missing_dir_ok); + } + } + Status CopyFile(const AzureLocation& src, const AzureLocation& dest) { + RETURN_NOT_OK(ValidateFileLocation(src)); + RETURN_NOT_OK(ValidateFileLocation(dest)); + if (src == dest) { + return Status::OK(); + } + auto dest_blob_client = blob_service_client_->GetBlobContainerClient(dest.container) + .GetBlobClient(dest.path); + auto src_url = blob_service_client_->GetBlobContainerClient(src.container) + .GetBlobClient(src.path) + .GetUrl(); + try { + dest_blob_client.CopyFromUri(src_url); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to copy a blob. 
(" + src_url + " -> " + dest_blob_client.GetUrl() + ")", + exception); + } return Status::OK(); } }; @@ -733,29 +1394,36 @@ bool AzureFileSystem::Equals(const FileSystem& other) const { } Result AzureFileSystem::GetFileInfo(const std::string& path) { - ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path)); - return impl_->GetFileInfo(p); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->GetFileInfo(location); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + Azure::Core::Context context; + Azure::Nullable page_size_hint; // unspecified + FileInfoVector results; + RETURN_NOT_OK( + impl_->GetFileInfoWithSelector(context, page_size_hint, select, &results)); + return {std::move(results)}; } Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { - ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path)); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); if (recursive) { - return impl_->CreateDirRecursive(p); + return impl_->CreateDirRecursive(location); } else { - return impl_->CreateDir(p); + return impl_->CreateDir(location); } } Status AzureFileSystem::DeleteDir(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->DeleteDir(location); } Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->DeleteDirContents(location, missing_dir_ok); } Status AzureFileSystem::DeleteRootDirContents() { @@ -771,12 +1439,15 @@ Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { } Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto src_location, AzureLocation::FromString(src)); + ARROW_ASSIGN_OR_RAISE(auto dest_location, AzureLocation::FromString(dest)); + return impl_->CopyFile(src_location, dest_location); } Result> AzureFileSystem::OpenInputStream( const std::string& path) { - return impl_->OpenInputFile(path, this); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->OpenInputFile(location, this); } Result> AzureFileSystem::OpenInputStream( @@ -786,7 +1457,8 @@ Result> AzureFileSystem::OpenInputStream( Result> AzureFileSystem::OpenInputFile( const std::string& path) { - return impl_->OpenInputFile(path, this); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->OpenInputFile(location, this); } Result> AzureFileSystem::OpenInputFile( @@ -796,12 +1468,14 @@ Result> AzureFileSystem::OpenInputFile( Result> AzureFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->OpenAppendStream(location, metadata, true, this); } Result> AzureFileSystem::OpenAppendStream( - const std::string&, const std::shared_ptr&) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + const std::string& path, const std::shared_ptr& 
metadata) { + ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); + return impl_->OpenAppendStream(location, metadata, false, this); } Result> AzureFileSystem::Make( diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 1f7047ff94c56..b2865b059ef6e 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -77,6 +77,11 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr service_principle_credentials_provider; + /// \brief Default metadata for OpenOutputStream. + /// + /// This will be ignored if non-empty metadata is passed to OpenOutputStream. + std::shared_ptr default_metadata; + AzureOptions(); Status ConfigureAccountKeyCredentials(const std::string& account_name, @@ -152,7 +157,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { const AzureOptions& options, const io::IOContext& = io::default_io_context()); private: - explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); + AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); class Impl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ecf0a19f684eb..792c63b209402 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -56,6 +56,7 @@ #include "arrow/testing/util.h" #include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" @@ -69,6 +70,9 @@ using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; +namespace Blobs = Azure::Storage::Blobs; +namespace Files = Azure::Storage::Files; + auto const* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis @@ -92,9 +96,15 @@ class AzuriteEnv : public ::testing::Environment { return; } auto temp_dir_ = *TemporaryDir::Make("azurefs-test-"); - server_process_ = bp::child(boost::this_process::environment(), exe_path, "--silent", - "--location", temp_dir_->path().ToString(), "--debug", - temp_dir_->path().ToString() + "/debug.log"); + auto debug_log_path_result = temp_dir_->path().Join("debug.log"); + if (!debug_log_path_result.ok()) { + status_ = debug_log_path_result.status(); + return; + } + debug_log_path_ = *debug_log_path_result; + server_process_ = + bp::child(boost::this_process::environment(), exe_path, "--silent", "--location", + temp_dir_->path().ToString(), "--debug", debug_log_path_.ToString()); if (!(server_process_.valid() && server_process_.running())) { auto error = "Could not start Azurite emulator."; server_process_.terminate(); @@ -110,6 +120,44 @@ class AzuriteEnv : public ::testing::Environment { server_process_.wait(); } + Result GetDebugLogSize() { + ARROW_ASSIGN_OR_RAISE(auto exists, arrow::internal::FileExists(debug_log_path_)); + if (!exists) { + return 0; + } + ARROW_ASSIGN_OR_RAISE(auto file_descriptor, + arrow::internal::FileOpenReadable(debug_log_path_)); + ARROW_RETURN_NOT_OK(arrow::internal::FileSeek(file_descriptor.fd(), 0, SEEK_END)); + return arrow::internal::FileTell(file_descriptor.fd()); + } + + Status DumpDebugLog(int64_t position = 0) { + ARROW_ASSIGN_OR_RAISE(auto exists, arrow::internal::FileExists(debug_log_path_)); + if (!exists) { + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(auto file_descriptor, + arrow::internal::FileOpenReadable(debug_log_path_)); + if (position > 0) { + ARROW_RETURN_NOT_OK(arrow::internal::FileSeek(file_descriptor.fd(), position)); + } + std::vector buffer; + const int64_t buffer_size = 4096; + buffer.reserve(buffer_size); + while (true) { + ARROW_ASSIGN_OR_RAISE( + auto n_read_bytes, + arrow::internal::FileRead(file_descriptor.fd(), buffer.data(), buffer_size)); + if (n_read_bytes <= 0) { + break; + } + std::cerr << std::string_view(reinterpret_cast(buffer.data()), + n_read_bytes); + } + std::cerr << std::endl; + return Status::OK(); + } + const std::string& account_name() const { return account_name_; } const std::string& account_key() const { return account_key_; } const Status status() const { return status_; } @@ -120,6 +168,7 @@ class AzuriteEnv : public ::testing::Environment { bp::child server_process_; Status status_; std::unique_ptr temp_dir_; + arrow::internal::PlatformFilename debug_log_path_; }; auto* azurite_env = ::testing::AddGlobalTestEnvironment(new AzuriteEnv); @@ -147,9 +196,8 @@ TEST(AzureFileSystem, OptionsCompare) { class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::unique_ptr blob_service_client_; - std::unique_ptr - datalake_service_client_; + std::unique_ptr blob_service_client_; + std::unique_ptr datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -167,15 +215,14 @@ class AzureFileSystemTest : public ::testing::Test { suite_skipped_ = true; GTEST_SKIP() << options.status().message(); } - container_name_ = RandomChars(32); - blob_service_client_ = std::make_unique( + // Stop-gap solution before GH-39119 is fixed. 
+ container_name_ = "z" + RandomChars(31); + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); - datalake_service_client_ = - std::make_unique( - options_.account_dfs_url, options_.storage_credentials_provider); + datalake_service_client_ = std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); - auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); - container_client.CreateIfNotExists(); + auto container_client = CreateContainer(container_name_); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), @@ -193,6 +240,20 @@ class AzureFileSystemTest : public ::testing::Test { } } + Blobs::BlobContainerClient CreateContainer(const std::string& name) { + auto container_client = blob_service_client_->GetBlobContainerClient(name); + (void)container_client.CreateIfNotExists(); + return container_client; + } + + Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client, + const std::string& name, const std::string& data = "") { + auto blob_client = container_client.GetBlockBlobClient(name); + (void)blob_client.UploadFrom(reinterpret_cast(data.data()), + data.size()); + return blob_client; + } + std::string PreexistingContainerName() const { return container_name_; } std::string PreexistingContainerPath() const { @@ -232,29 +293,118 @@ class AzureFileSystemTest : public ::testing::Test { void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - auto blob_client = - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); - std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); - blob_client.UploadFrom(reinterpret_cast(all_lines.data()), - total_size); + const auto path = PreexistingContainerPath() + path_to_file; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); + ASSERT_OK(output->Write(all_lines)); + ASSERT_OK(output->Close()); } void RunGetFileInfoObjectWithNestedStructureTest(); void RunGetFileInfoObjectTest(); + + struct HierarchicalPaths { + std::string container; + std::string directory; + std::vector sub_paths; + }; + + // Need to use "void" as the return type to use ASSERT_* in this method. 
+ void CreateHierarchicalData(HierarchicalPaths& paths) { + const auto container_path = RandomContainerName(); + const auto directory_path = + internal::ConcatAbstractPath(container_path, RandomDirectoryName()); + const auto sub_directory_path = + internal::ConcatAbstractPath(directory_path, "new-sub"); + const auto sub_blob_path = + internal::ConcatAbstractPath(sub_directory_path, "sub.txt"); + const auto top_blob_path = internal::ConcatAbstractPath(directory_path, "top.txt"); + ASSERT_OK(fs_->CreateDir(sub_directory_path, true)); + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(sub_blob_path)); + ASSERT_OK(output->Write(std::string_view("sub"))); + ASSERT_OK(output->Close()); + ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(top_blob_path)); + ASSERT_OK(output->Write(std::string_view("top"))); + ASSERT_OK(output->Close()); + + AssertFileInfo(fs_.get(), container_path, FileType::Directory); + AssertFileInfo(fs_.get(), directory_path, FileType::Directory); + AssertFileInfo(fs_.get(), sub_directory_path, FileType::Directory); + AssertFileInfo(fs_.get(), sub_blob_path, FileType::File); + AssertFileInfo(fs_.get(), top_blob_path, FileType::File); + + paths.container = container_path; + paths.directory = directory_path; + paths.sub_paths = { + sub_directory_path, + sub_blob_path, + top_blob_path, + }; + } + + char const* kSubData = "sub data"; + char const* kSomeData = "some data"; + char const* kOtherData = "other data"; + + void SetUpSmallFileSystemTree() { + // Set up test containers + CreateContainer("empty-container"); + auto container = CreateContainer("container"); + + CreateBlob(container, "emptydir/"); + CreateBlob(container, "somedir/subdir/subfile", kSubData); + CreateBlob(container, "somefile", kSomeData); + // Add an explicit marker for a non-empty directory. + CreateBlob(container, "otherdir/1/2/"); + // otherdir/{1/,2/,3/} are implicitly assumed to exist because of + // the otherdir/1/2/3/otherfile blob. 
+ CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData); + } + + void AssertInfoAllContainersRecursive(const std::vector& infos) { + ASSERT_EQ(infos.size(), 14); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", FileType::File, + strlen(kOtherData)); + AssertFileInfo(infos[7], "container/somedir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[9], "container/somedir/subdir/subfile", FileType::File, + strlen(kSubData)); + AssertFileInfo(infos[10], "container/somefile", FileType::File, strlen(kSomeData)); + AssertFileInfo(infos[11], "empty-container", FileType::Directory); + AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory); + AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File); + } }; class AzuriteFileSystemTest : public AzureFileSystemTest { - Result MakeOptions() { + Result MakeOptions() override { EXPECT_THAT(GetAzuriteEnv(), NotNull()); ARROW_EXPECT_OK(GetAzuriteEnv()->status()); + ARROW_ASSIGN_OR_RAISE(debug_log_start_, GetAzuriteEnv()->GetDebugLogSize()); AzureOptions options; options.backend = AzureBackend::Azurite; ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); return options; } + + void TearDown() override { + AzureFileSystemTest::TearDown(); + if (HasFailure()) { + // XXX: This may not include all logs in the target test because + // Azurite doesn't flush debug logs immediately... You may want + // to check the log manually... + ARROW_IGNORE_EXPR(GetAzuriteEnv()->DumpDebugLog(debug_log_start_)); + } + } + + int64_t debug_log_start_ = 0; }; class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { @@ -324,7 +474,7 @@ TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); - ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); + ASSERT_NOT_OK(hierarchical_namespace.Enabled("nonexistent-container")); } TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { @@ -337,7 +487,7 @@ TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); - AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); + AssertFileInfo(fs_.get(), "nonexistent-container", FileType::NotFound); // URI ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); @@ -347,21 +497,26 @@ void AzureFileSystemTest::RunGetFileInfoObjectWithNestedStructureTest() { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. 
- blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(kObjectName) - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + ASSERT_OK_AND_ASSIGN( + auto output, + fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName, /*metadata=*/{})); + const std::string_view data(kLoremIpsum); + ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. - // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient("test-object-dir/some_other_dir0") - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); - - blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(std::string(kObjectName) + "0") - .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + ASSERT_OK_AND_ASSIGN(output, + fs_->OpenOutputStream( + PreexistingContainerPath() + "test-object-dir/some_other_dir0", + /*metadata=*/{})); + ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Close()); + ASSERT_OK_AND_ASSIGN( + output, fs_->OpenOutputStream(PreexistingContainerPath() + kObjectName + "0", + /*metadata=*/{})); + ASSERT_OK(output->Write(data)); + ASSERT_OK(output->Close()); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", @@ -417,6 +572,180 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } +TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { + SetUpSmallFileSystemTree(); + + FileSelector select; + std::vector infos; + + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 3); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "empty-container", FileType::Directory); + AssertFileInfo(infos[2], container_name_, FileType::Directory); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Nonexistent container + select.base_dir = "nonexistent-container"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/somedir", FileType::Directory); + AssertFileInfo(infos[3], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + select.base_dir = "container/somedir/subdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + 
ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); + // Nonexistent + select.base_dir = "container/nonexistent"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + + // Trailing slashes + select.base_dir = "empty-container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.base_dir = "nonexistent-container/"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.base_dir = "container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { + SetUpSmallFileSystemTree(); + + FileSelector select; + select.recursive = true; + + std::vector infos; + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 14); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertInfoAllContainersRecursive(infos); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 10); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile", FileType::File, 10); + AssertFileInfo(infos[6], "container/somedir", FileType::Directory); + AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File, 8); + AssertFileInfo(infos[9], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 2); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); + + select.base_dir = "container/otherdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) { + { + auto container = CreateContainer("container"); + CreateBlob(container, "mydir/emptydir1/"); + CreateBlob(container, "mydir/emptydir2/"); + CreateBlob(container, "mydir/nonemptydir1/"); // explicit dir marker + 
    CreateBlob(container, "mydir/nonemptydir1/somefile", kSomeData);
+    CreateBlob(container, "mydir/nonemptydir2/somefile", kSomeData);
+  }
+  std::vector<FileInfo> infos;
+
+  FileSelector select;  // non-recursive
+  select.base_dir = "container";
+
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertFileInfo(infos[0], "container/mydir", FileType::Directory);
+
+  select.base_dir = "container/mydir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 4);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory);
+  AssertFileInfo(infos[1], "container/mydir/emptydir2", FileType::Directory);
+  AssertFileInfo(infos[2], "container/mydir/nonemptydir1", FileType::Directory);
+  AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory);
+
+  select.base_dir = "container/mydir/emptydir1";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  select.base_dir = "container/mydir/emptydir2";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  select.base_dir = "container/mydir/nonemptydir1";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File);
+
+  select.base_dir = "container/mydir/nonemptydir2";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File);
+}
+
 TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) {
   ASSERT_RAISES(Invalid, fs_->CreateDir("", false));
 }
@@ -507,6 +836,214 @@ TEST_F(AzuriteFileSystemTest, CreateDirUri) {
   ASSERT_RAISES(Invalid, fs_->CreateDir("abfs://" + RandomContainerName(), true));
 }

+TEST_F(AzuriteFileSystemTest, DeleteDirSuccessContainer) {
+  const auto container_name = RandomContainerName();
+  ASSERT_OK(fs_->CreateDir(container_name));
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+  ASSERT_OK(fs_->DeleteDir(container_name));
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::NotFound);
+}
+
+TEST_F(AzuriteFileSystemTest, DeleteDirSuccessEmpty) {
+  const auto directory_path =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  // Without hierarchical namespace support there are only virtual
+  // directories, so CreateDir() and DeleteDir() are no-ops here.
+  ASSERT_OK(fs_->CreateDir(directory_path));
+  arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound);
+  ASSERT_OK(fs_->DeleteDir(directory_path));
+  arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound);
+}
+
+TEST_F(AzuriteFileSystemTest, DeleteDirSuccessNonexistent) {
+  const auto directory_path =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  // Without hierarchical namespace support there are only virtual
+  // directories, so DeleteDir() on a nonexistent directory does nothing.
+  ASSERT_OK(fs_->DeleteDir(directory_path));
+  arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound);
+}
+
+TEST_F(AzuriteFileSystemTest, DeleteDirSuccessHaveBlobs) {
+#ifdef __APPLE__
+  GTEST_SKIP() << "This test fails due to an Azurite problem: "
+                  "https://github.com/Azure/Azurite/pull/2302";
+#endif
+  const auto directory_path =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  // We must use 257 or more blobs here to test pagination of ListBlobs(),
+  // because we can't add 257 or more delete blob requests to one SubmitBatch().
+  int64_t n_blobs = 257;
+  for (int64_t i = 0; i < n_blobs; ++i) {
+    const auto blob_path =
+        internal::ConcatAbstractPath(directory_path, std::to_string(i) + ".txt");
+    ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path));
+    ASSERT_OK(output->Write(std::string_view(std::to_string(i))));
+    ASSERT_OK(output->Close());
+    arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File);
+  }
+  ASSERT_OK(fs_->DeleteDir(directory_path));
+  for (int64_t i = 0; i < n_blobs; ++i) {
+    const auto blob_path =
+        internal::ConcatAbstractPath(directory_path, std::to_string(i) + ".txt");
+    arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound);
+  }
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirSuccessEmpty) {
+  const auto directory_path =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  ASSERT_OK(fs_->CreateDir(directory_path, true));
+  arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::Directory);
+  ASSERT_OK(fs_->DeleteDir(directory_path));
+  arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirFailureNonexistent) {
+  const auto path =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  ASSERT_RAISES(IOError, fs_->DeleteDir(path));
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirSuccessHaveBlob) {
+  const auto directory_path =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  const auto blob_path = internal::ConcatAbstractPath(directory_path, "hello.txt");
+  ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path));
+  ASSERT_OK(output->Write(std::string_view("hello")));
+  ASSERT_OK(output->Close());
+  arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File);
+  ASSERT_OK(fs_->DeleteDir(directory_path));
+  arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirSuccessHaveDirectory) {
+  const auto parent =
+      internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName());
+  const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+  ASSERT_OK(fs_->CreateDir(path, true));
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+  arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
+  ASSERT_OK(fs_->DeleteDir(parent));
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+  arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
+}
+
+TEST_F(AzuriteFileSystemTest, DeleteDirUri) {
+  ASSERT_RAISES(Invalid, fs_->DeleteDir("abfs://" + PreexistingContainerPath()));
+}
+
+TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessContainer) {
+#ifdef __APPLE__
+  GTEST_SKIP() << "This test fails due to an Azurite problem: "
+                  "https://github.com/Azure/Azurite/pull/2302";
+#endif
+  HierarchicalPaths paths;
+
CreateHierarchicalData(paths); + ASSERT_OK(fs_->DeleteDirContents(paths.container)); + arrow::fs::AssertFileInfo(fs_.get(), paths.container, FileType::Directory); + arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + for (const auto& sub_path : paths.sub_paths) { + arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + } +} + +TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessDirectory) { +#ifdef __APPLE__ + GTEST_SKIP() << "This test fails by an Azurite problem: " + "https://github.com/Azure/Azurite/pull/2302"; +#endif + HierarchicalPaths paths; + CreateHierarchicalData(paths); + ASSERT_OK(fs_->DeleteDirContents(paths.directory)); + // GH-38772: We may change this to FileType::Directory. + arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + for (const auto& sub_path : paths.sub_paths) { + arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + } +} + +TEST_F(AzuriteFileSystemTest, DeleteDirContentsSuccessNonexistent) { + const auto directory_path = + internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); + ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); +} + +TEST_F(AzuriteFileSystemTest, DeleteDirContentsFailureNonexistent) { + const auto directory_path = + internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); + ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirContentsSuccessExist) { + HierarchicalPaths paths; + CreateHierarchicalData(paths); + ASSERT_OK(fs_->DeleteDirContents(paths.directory)); + arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::Directory); + for (const auto& sub_path : paths.sub_paths) { + arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + } +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirContentsSuccessNonexistent) { + const auto directory_path = + internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); + ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); + arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, DeleteDirContentsFailureNonexistent) { + const auto directory_path = + internal::ConcatAbstractPath(PreexistingContainerName(), RandomDirectoryName()); + ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); +} + +TEST_F(AzuriteFileSystemTest, CopyFileSuccessDestinationNonexistent) { + const auto destination_path = + internal::ConcatAbstractPath(PreexistingContainerName(), "copy-destionation"); + ASSERT_OK(fs_->CopyFile(PreexistingObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(kLoremIpsum, buffer->ToString()); +} + +TEST_F(AzuriteFileSystemTest, CopyFileSuccessDestinationSame) { + ASSERT_OK(fs_->CopyFile(PreexistingObjectPath(), PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(kLoremIpsum, buffer->ToString()); +} + +TEST_F(AzuriteFileSystemTest, CopyFileFailureDestinationTrailingSlash) { + ASSERT_RAISES(IOError, + 
fs_->CopyFile(PreexistingObjectPath(), + internal::EnsureTrailingSlash(PreexistingObjectPath()))); +} + +TEST_F(AzuriteFileSystemTest, CopyFileFailureSourceNonexistent) { + const auto destination_path = + internal::ConcatAbstractPath(PreexistingContainerName(), "copy-destionation"); + ASSERT_RAISES(IOError, fs_->CopyFile(NotFoundObjectPath(), destination_path)); +} + +TEST_F(AzuriteFileSystemTest, CopyFileFailureDestinationParentNonexistent) { + const auto destination_path = + internal::ConcatAbstractPath(RandomContainerName(), "copy-destionation"); + ASSERT_RAISES(IOError, fs_->CopyFile(PreexistingObjectPath(), destination_path)); +} + +TEST_F(AzuriteFileSystemTest, CopyFileUri) { + const auto destination_path = + internal::ConcatAbstractPath(PreexistingContainerName(), "copy-destionation"); + ASSERT_RAISES(Invalid, + fs_->CopyFile("abfs://" + PreexistingObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, + fs_->CopyFile(PreexistingObjectPath(), "abfs://" + destination_path)); +} + TEST_F(AzuriteFileSystemTest, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -647,6 +1184,157 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { ASSERT_RAISES(Invalid, stream->Tell()); } +TEST_F(AzuriteFileSystemTest, TestWriteMetadata) { + options_.default_metadata = arrow::key_value_metadata({{"foo", "bar"}}); + + ASSERT_OK_AND_ASSIGN(auto fs_with_defaults, AzureFileSystem::Make(options_)); + std::string path = "object_with_defaults"; + auto location = PreexistingContainerPath() + path; + ASSERT_OK_AND_ASSIGN(auto output, + fs_with_defaults->OpenOutputStream(location, /*metadata=*/{})); + const std::string_view expected(kLoremIpsum); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + + // Verify the metadata has been set. + auto blob_metadata = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path) + .GetProperties() + .Value.Metadata; + EXPECT_EQ(Azure::Core::CaseInsensitiveMap{std::make_pair("foo", "bar")}, blob_metadata); + + // Check that explicit metadata overrides the defaults. + ASSERT_OK_AND_ASSIGN( + output, fs_with_defaults->OpenOutputStream( + location, /*metadata=*/arrow::key_value_metadata({{"bar", "foo"}}))); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + blob_metadata = blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path) + .GetProperties() + .Value.Metadata; + // Defaults are overwritten and not merged. + EXPECT_EQ(Azure::Core::CaseInsensitiveMap{std::make_pair("bar", "foo")}, blob_metadata); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamSmall) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const std::string_view expected(kLoremIpsum); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + + // Verify we can read the object back. 
+ ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::array inbuf{}; + ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); + + EXPECT_EQ(expected, std::string_view(inbuf.data(), size)); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamLarge) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; + std::array buffers{ + std::string(sizes[0], 'A'), + std::string(sizes[1], 'B'), + std::string(sizes[2], 'C'), + }; + auto expected = std::int64_t{0}; + for (auto i = 0; i != 3; ++i) { + ASSERT_OK(output->Write(buffers[i])); + expected += sizes[i]; + ASSERT_EQ(expected, output->Tell()); + } + ASSERT_OK(output->Close()); + + // Verify we can read the object back. + ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::string contents; + std::shared_ptr buffer; + do { + ASSERT_OK_AND_ASSIGN(buffer, input->Read(128 * 1024)); + ASSERT_TRUE(buffer); + contents.append(buffer->ToString()); + } while (buffer->size() != 0); + + EXPECT_EQ(contents, buffers[0] + buffers[1] + buffers[2]); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamTruncatesExistingFile) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const std::string_view expected0("Existing blob content"); + ASSERT_OK(output->Write(expected0)); + ASSERT_OK(output->Close()); + + // Check that the initial content has been written - if not this test is not achieving + // what it's meant to. + ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::array inbuf{}; + ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(expected0, std::string_view(inbuf.data(), size)); + + ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(path, {})); + const std::string_view expected1(kLoremIpsum); + ASSERT_OK(output->Write(expected1)); + ASSERT_OK(output->Close()); + + // Verify that the initial content has been overwritten. + ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(expected1, std::string_view(inbuf.data(), size)); +} + +TEST_F(AzuriteFileSystemTest, OpenAppendStreamDoesNotTruncateExistingFile) { + const auto path = PreexistingContainerPath() + "test-write-object"; + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + const std::string_view expected0("Existing blob content"); + ASSERT_OK(output->Write(expected0)); + ASSERT_OK(output->Close()); + + // Check that the initial content has been written - if not this test is not achieving + // what it's meant to. + ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + + std::array inbuf{}; + ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(expected0, std::string_view(inbuf.data())); + + ASSERT_OK_AND_ASSIGN(output, fs_->OpenAppendStream(path, {})); + const std::string_view expected1(kLoremIpsum); + ASSERT_OK(output->Write(expected1)); + ASSERT_OK(output->Close()); + + // Verify that the initial content has not been overwritten and that the block from + // the other client was not committed. 
+ ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); + EXPECT_EQ(std::string(inbuf.data(), size), + std::string(expected0) + std::string(expected1)); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamClosed) { + const auto path = internal::ConcatAbstractPath(PreexistingContainerName(), + "open-output-stream-closed.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK(output->Close()); + ASSERT_RAISES(Invalid, output->Write(kLoremIpsum, std::strlen(kLoremIpsum))); + ASSERT_RAISES(Invalid, output->Flush()); + ASSERT_RAISES(Invalid, output->Tell()); +} + +TEST_F(AzuriteFileSystemTest, OpenOutputStreamUri) { + const auto path = internal::ConcatAbstractPath(PreexistingContainerName(), + "open-output-stream-uri.txt"); + ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + path)); +} + TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 9ecc4610f3864..810e9c179b156 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -654,8 +654,7 @@ Status CopyFiles(const std::shared_ptr& source_fs, "', which is outside base dir '", source_sel.base_dir, "'"); } - auto destination_path = - internal::ConcatAbstractPath(destination_base_dir, std::string(*relative)); + auto destination_path = internal::ConcatAbstractPath(destination_base_dir, *relative); if (source_info.IsDirectory()) { dirs.push_back(destination_path); diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index e030014159cf4..d440629a02496 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -304,7 +304,7 @@ namespace { /// Workhorse for streaming async implementation of `GetFileInfo` /// (`GetFileInfoGenerator`). /// -/// There are two variants of async discovery functions suported: +/// There are two variants of async discovery functions supported: /// 1. 
`DiscoverDirectoryFiles`, which parallelizes traversal of individual directories /// so that each directory results are yielded as a separate `FileInfoGenerator` via /// an underlying `DiscoveryImplIterator`, which delivers items in chunks (default size diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 46ea436a9f31a..9c895ae76c7b8 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -52,7 +52,7 @@ std::vector SplitAbstractPath(const std::string& path, char sep) { } auto append_part = [&parts, &v](size_t start, size_t end) { - parts.push_back(std::string(v.substr(start, end - start))); + parts.emplace_back(v.substr(start, end - start)); }; size_t start = 0; @@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int offset, int length, char return ""; } std::vector components = SplitAbstractPath(s, sep); - std::stringstream combined; if (offset >= static_cast(components.size())) { return ""; } - int end = offset + length; - if (end > static_cast(components.size())) { - end = static_cast(components.size()); - } - for (int i = offset; i < end; i++) { + const auto end = std::min(static_cast(offset) + length, components.size()); + std::stringstream combined; + for (auto i = static_cast(offset); i < end; i++) { combined << components[i]; if (i < end - 1) { combined << sep; @@ -140,16 +137,20 @@ Status ValidateAbstractPathParts(const std::vector& parts) { return Status::OK(); } -std::string ConcatAbstractPath(const std::string& base, const std::string& stem) { +std::string ConcatAbstractPath(std::string_view base, std::string_view stem) { DCHECK(!stem.empty()); if (base.empty()) { - return stem; + return std::string{stem}; } - return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem)); + std::string result; + result.reserve(base.length() + stem.length() + 1); // extra 1 is for potential kSep + result += EnsureTrailingSlash(base); + result += RemoveLeadingSlash(stem); + return result; } std::string EnsureTrailingSlash(std::string_view v) { - if (v.length() > 0 && v.back() != kSep) { + if (!v.empty() && !HasTrailingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "C:/"... // Unless the local filesystem always uses absolute paths return std::string(v) + kSep; @@ -159,7 +160,7 @@ std::string EnsureTrailingSlash(std::string_view v) { } std::string EnsureLeadingSlash(std::string_view v) { - if (v.length() == 0 || v.front() != kSep) { + if (!HasLeadingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "/C:"... 
return kSep + std::string(v); } else { @@ -197,10 +198,6 @@ Status AssertNoTrailingSlash(std::string_view key) { return Status::OK(); } -bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } - -bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } - Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { if (base.empty() || base.front() != kSep) { @@ -383,7 +380,7 @@ struct Globber::Impl { Globber::Globber(std::string pattern) : impl_(new Impl(pattern)) {} -Globber::~Globber() {} +Globber::~Globber() = default; bool Globber::Matches(const std::string& path) { return regex_match(path, impl_->pattern_); diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 2c8c123e779f4..1da7afd3f9381 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -69,7 +69,7 @@ Status ValidateAbstractPathParts(const std::vector& parts); // Append a non-empty stem to an abstract path. ARROW_EXPORT -std::string ConcatAbstractPath(const std::string& base, const std::string& stem); +std::string ConcatAbstractPath(std::string_view base, std::string_view stem); // Make path relative to base, if it starts with base. Otherwise error out. ARROW_EXPORT @@ -94,11 +94,13 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); -ARROW_EXPORT -bool HasTrailingSlash(std::string_view s); +inline bool HasTrailingSlash(std::string_view s) { + return !s.empty() && s.back() == kSep; +} -ARROW_EXPORT -bool HasLeadingSlash(std::string_view s); +inline bool HasLeadingSlash(std::string_view s) { + return !s.empty() && s.front() == kSep; +} ARROW_EXPORT bool IsAncestorOf(std::string_view ancestor, std::string_view descendant); diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 26a1530660781..62bec9b23b56f 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -951,7 +951,8 @@ class ClientBuilder { client_config_.caPath = ToAwsString(internal::global_options.tls_ca_dir_path); } - const bool use_virtual_addressing = options_.endpoint_override.empty(); + const bool use_virtual_addressing = + options_.endpoint_override.empty() || options_.force_virtual_addressing; // Set proxy options if provided if (!options_.proxy_options.scheme.empty()) { @@ -1042,7 +1043,7 @@ class RegionResolver { lock.unlock(); ARROW_ASSIGN_OR_RAISE(auto region, ResolveRegionUncached(bucket)); lock.lock(); - // Note we don't cache a non-existent bucket, as the bucket could be created later + // Note we don't cache a nonexistent bucket, as the bucket could be created later cache_[bucket] = region; return region; } @@ -1546,7 +1547,7 @@ class ObjectOutputStream final : public io::OutputStream { nbytes -= offset; }; - // Handle case where we have some bytes bufferred from prior calls. + // Handle case where we have some bytes buffered from prior calls. 
     if (current_part_size_ > 0) {
       // Try to fill current buffer
       const int64_t to_copy = std::min(nbytes, kPartUploadSize - current_part_size_);
@@ -2408,7 +2409,16 @@ class S3FileSystem::Impl : public std::enable_shared_from_this<S3FileSystem::Impl>
     std::vector<std::string> file_paths;
     for (const auto& file_info : file_infos) {
       DCHECK_GT(file_info.path().size(), bucket.size());
-      file_paths.push_back(file_info.path().substr(bucket.size() + 1));
+      auto file_path = file_info.path().substr(bucket.size() + 1);
+      if (file_info.IsDirectory()) {
+        // The selector returns FileInfo objects for directories with a
+        // path that never ends in a trailing slash, but for AWS the file
+        // needs to have a trailing slash to be recognized as a directory
+        // (https://github.com/apache/arrow/issues/38618).
+        DCHECK_OK(internal::AssertNoTrailingSlash(file_path));
+        file_path = file_path + kSep;
+      }
+      file_paths.push_back(std::move(file_path));
     }
     scheduler->AddSimpleTask(
         [=, file_paths = std::move(file_paths)] {
@@ -3007,7 +3017,7 @@ S3GlobalOptions S3GlobalOptions::Defaults() {
   auto result = arrow::internal::GetEnvVar("ARROW_S3_LOG_LEVEL");
   if (result.ok()) {
-    // Extract, trim, and downcase the value of the enivronment variable
+    // Extract, trim, and downcase the value of the environment variable
     auto value =
         arrow::internal::AsciiToLower(arrow::internal::TrimString(result.ValueUnsafe()));
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 9900a9a1c0aa5..13a0abde32318 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -130,7 +130,7 @@ struct ARROW_EXPORT S3Options {
   std::string role_arn;
   /// Optional identifier for an assumed role session.
   std::string session_name;
-  /// Optional external idenitifer to pass to STS when assuming a role
+  /// Optional external identifier to pass to STS when assuming a role
   std::string external_id;
   /// Frequency (in seconds) to refresh temporary credentials from assumed role
   int load_frequency = 900;
@@ -144,6 +144,14 @@ struct ARROW_EXPORT S3Options {
   /// Type of credentials being used. Set along with credentials_provider.
   S3CredentialsKind credentials_kind = S3CredentialsKind::Default;

+  /// Whether to use virtual addressing of buckets
+  ///
+  /// If true, then virtual addressing is always enabled.
+  /// If false, then virtual addressing is only enabled if `endpoint_override` is empty.
+  ///
+  /// This can be used for non-AWS backends that only support virtual hosted-style access.
+  bool force_virtual_addressing = false;
+
   /// Whether OutputStream writes will be issued in the background, without blocking.
bool background_writes = true; @@ -185,7 +193,7 @@ struct ARROW_EXPORT S3Options { const std::string& external_id = "", int load_frequency = 900, const std::shared_ptr& stsClient = NULLPTR); - /// Configure with credentials from role assumed using a web identitiy token + /// Configure with credentials from role assumed using a web identity token void ConfigureAssumeRoleWithWebIdentityCredentials(); std::string GetAccessKey() const; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index b789845bd1aac..487a6abb18903 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -365,10 +365,10 @@ TEST_F(S3RegionResolutionTest, RestrictedBucket) { } TEST_F(S3RegionResolutionTest, NonExistentBucket) { - auto maybe_region = ResolveS3BucketRegion("ursa-labs-non-existent-bucket"); + auto maybe_region = ResolveS3BucketRegion("ursa-labs-nonexistent-bucket"); ASSERT_RAISES(IOError, maybe_region); ASSERT_THAT(maybe_region.status().message(), - ::testing::HasSubstr("Bucket 'ursa-labs-non-existent-bucket' not found")); + ::testing::HasSubstr("Bucket 'ursa-labs-nonexistent-bucket' not found")); } TEST_F(S3RegionResolutionTest, InvalidBucketName) { @@ -645,13 +645,13 @@ TEST_F(TestS3FS, GetFileInfoObject) { // Nonexistent AssertFileInfo(fs_.get(), "bucket/emptyd", FileType::NotFound); AssertFileInfo(fs_.get(), "bucket/somed", FileType::NotFound); - AssertFileInfo(fs_.get(), "non-existent-bucket/somed", FileType::NotFound); + AssertFileInfo(fs_.get(), "nonexistent-bucket/somed", FileType::NotFound); // Trailing slashes AssertFileInfo(fs_.get(), "bucket/emptydir/", FileType::Directory, kNoSize); AssertFileInfo(fs_.get(), "bucket/somefile/", FileType::File, 9); AssertFileInfo(fs_.get(), "bucket/emptyd/", FileType::NotFound); - AssertFileInfo(fs_.get(), "non-existent-bucket/somed/", FileType::NotFound); + AssertFileInfo(fs_.get(), "nonexistent-bucket/somed/", FileType::NotFound); // URIs ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:bucket/emptydir")); @@ -1057,7 +1057,7 @@ TEST_F(TestS3FS, Move) { ASSERT_OK(fs_->Move("bucket/a=2/newfile", "bucket/a=3/newfile")); // Nonexistent - ASSERT_RAISES(IOError, fs_->Move("bucket/non-existent", "bucket/newfile2")); + ASSERT_RAISES(IOError, fs_->Move("bucket/nonexistent", "bucket/newfile2")); ASSERT_RAISES(IOError, fs_->Move("nonexistent-bucket/somefile", "bucket/newfile2")); ASSERT_RAISES(IOError, fs_->Move("bucket/somefile", "nonexistent-bucket/newfile2")); AssertFileInfo(fs_.get(), "bucket/newfile2", FileType::NotFound); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 6c5dda8e659df..040917dcd218a 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -126,6 +126,12 @@ void SortInfos(std::vector* infos) { std::sort(infos->begin(), infos->end(), FileInfo::ByPath{}); } +std::vector SortedInfos(const std::vector& infos) { + auto sorted = infos; + SortInfos(&sorted); + return sorted; +} + void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos) { auto fut = CollectAsyncGenerator(gen); ASSERT_FINISHES_OK_AND_ASSIGN(auto nested_infos, fut); diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 8156721b8537c..62b488e159a24 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -74,6 +74,10 @@ void CreateFile(FileSystem* fs, const std::string& path, const std::string& data ARROW_TESTING_EXPORT void 
SortInfos(FileInfoVector* infos); +// Create a copy of a FileInfo vector sorted by lexicographic path order +ARROW_TESTING_EXPORT +FileInfoVector SortedInfos(const FileInfoVector& infos); + ARROW_TESTING_EXPORT void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos); @@ -170,7 +174,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool allow_move_dir_over_non_empty_dir() const { return false; } // - Whether the filesystem allows appending to a file virtual bool allow_append_to_file() const { return true; } - // - Whether the filesystem allows appending to a new (not existent yet) file + // - Whether the filesystem allows appending to a nonexistent file virtual bool allow_append_to_new_file() const { return true; } // - Whether the filesystem supports directory modification times virtual bool have_directory_mtimes() const { return true; } diff --git a/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in b/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in index f072b2603e375..3c043b05a6bd5 100644 --- a/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in +++ b/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in @@ -32,7 +32,7 @@ find_dependency(ArrowTesting) include("${CMAKE_CURRENT_LIST_DIR}/ArrowFlightTestingTargets.cmake") -arrow_keep_backward_compatibility(ArrowFlightTetsing arrow_flight_testing) +arrow_keep_backward_compatibility(ArrowFlightTesting arrow_flight_testing) check_required_components(ArrowFlightTesting) diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index e26a821359781..1df71d2029f74 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -299,7 +299,7 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// \brief Request and poll a long running query /// \param[in] options Per-RPC options /// \param[in] descriptor the dataset request or a descriptor returned by a - /// prioir PollFlightInfo call + /// prior PollFlightInfo call /// \return Arrow result with the PollInfo describing the status of /// the requested query arrow::Result> PollFlightInfo( diff --git a/cpp/src/arrow/flight/cookie_internal.h b/cpp/src/arrow/flight/cookie_internal.h index 84647a1c94ca3..62c0390c585b3 100644 --- a/cpp/src/arrow/flight/cookie_internal.h +++ b/cpp/src/arrow/flight/cookie_internal.h @@ -35,14 +35,14 @@ namespace flight { namespace internal { /// \brief Case insensitive comparator for use by cookie caching map. Cookies are not -/// case sensitive. +/// case-sensitive. class ARROW_FLIGHT_EXPORT CaseInsensitiveComparator { public: bool operator()(const std::string& t1, const std::string& t2) const; }; /// \brief Case insensitive hasher for use by cookie caching map. Cookies are not -/// case sensitive. +/// case-sensitive. 
class ARROW_FLIGHT_EXPORT CaseInsensitiveHash { public: size_t operator()(const std::string& key) const; diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index 5feb310fc14a2..522973bec7231 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -562,7 +562,7 @@ class TestCookieParsing : public ::testing::Test { EXPECT_EQ(cookie_as_string, cookie.AsCookieString()); } - void VerifyCookieDateConverson(std::string date, const std::string& converted_date) { + void VerifyCookieDateConversion(std::string date, const std::string& converted_date) { internal::Cookie::ConvertCookieDate(&date); EXPECT_EQ(converted_date, date); } @@ -646,21 +646,21 @@ TEST_F(TestCookieParsing, ToString) { } TEST_F(TestCookieParsing, DateConversion) { - VerifyCookieDateConverson("Mon, 01 jan 2038 22:15:36 GMT;", "01 01 2038 22:15:36"); - VerifyCookieDateConverson("TUE, 10 Feb 2038 22:15:36 GMT", "10 02 2038 22:15:36"); - VerifyCookieDateConverson("WED, 20 MAr 2038 22:15:36 GMT;", "20 03 2038 22:15:36"); - VerifyCookieDateConverson("thu, 15 APR 2038 22:15:36 GMT", "15 04 2038 22:15:36"); - VerifyCookieDateConverson("Fri, 30 mAY 2038 22:15:36 GMT;", "30 05 2038 22:15:36"); - VerifyCookieDateConverson("Sat, 03 juN 2038 22:15:36 GMT", "03 06 2038 22:15:36"); - VerifyCookieDateConverson("Sun, 01 JuL 2038 22:15:36 GMT;", "01 07 2038 22:15:36"); - VerifyCookieDateConverson("Fri, 06 aUg 2038 22:15:36 GMT", "06 08 2038 22:15:36"); - VerifyCookieDateConverson("Fri, 01 SEP 2038 22:15:36 GMT;", "01 09 2038 22:15:36"); - VerifyCookieDateConverson("Fri, 01 OCT 2038 22:15:36 GMT", "01 10 2038 22:15:36"); - VerifyCookieDateConverson("Fri, 01 Nov 2038 22:15:36 GMT;", "01 11 2038 22:15:36"); - VerifyCookieDateConverson("Fri, 01 deC 2038 22:15:36 GMT", "01 12 2038 22:15:36"); - VerifyCookieDateConverson("", ""); - VerifyCookieDateConverson("Fri, 01 INVALID 2038 22:15:36 GMT;", - "01 INVALID 2038 22:15:36"); + VerifyCookieDateConversion("Mon, 01 jan 2038 22:15:36 GMT;", "01 01 2038 22:15:36"); + VerifyCookieDateConversion("TUE, 10 Feb 2038 22:15:36 GMT", "10 02 2038 22:15:36"); + VerifyCookieDateConversion("WED, 20 MAr 2038 22:15:36 GMT;", "20 03 2038 22:15:36"); + VerifyCookieDateConversion("thu, 15 APR 2038 22:15:36 GMT", "15 04 2038 22:15:36"); + VerifyCookieDateConversion("Fri, 30 mAY 2038 22:15:36 GMT;", "30 05 2038 22:15:36"); + VerifyCookieDateConversion("Sat, 03 juN 2038 22:15:36 GMT", "03 06 2038 22:15:36"); + VerifyCookieDateConversion("Sun, 01 JuL 2038 22:15:36 GMT;", "01 07 2038 22:15:36"); + VerifyCookieDateConversion("Fri, 06 aUg 2038 22:15:36 GMT", "06 08 2038 22:15:36"); + VerifyCookieDateConversion("Fri, 01 SEP 2038 22:15:36 GMT;", "01 09 2038 22:15:36"); + VerifyCookieDateConversion("Fri, 01 OCT 2038 22:15:36 GMT", "01 10 2038 22:15:36"); + VerifyCookieDateConversion("Fri, 01 Nov 2038 22:15:36 GMT;", "01 11 2038 22:15:36"); + VerifyCookieDateConversion("Fri, 01 deC 2038 22:15:36 GMT", "01 12 2038 22:15:36"); + VerifyCookieDateConversion("", ""); + VerifyCookieDateConversion("Fri, 01 INVALID 2038 22:15:36 GMT;", + "01 INVALID 2038 22:15:36"); } TEST_F(TestCookieParsing, ParseCookieAttribute) { diff --git a/cpp/src/arrow/flight/flight_test.cc b/cpp/src/arrow/flight/flight_test.cc index 020fb7b24efc3..55cc938870f85 100644 --- a/cpp/src/arrow/flight/flight_test.cc +++ b/cpp/src/arrow/flight/flight_test.cc @@ -453,7 +453,7 @@ class TestTls : public ::testing::Test { // get initialized. 
// https://github.com/grpc/grpc/issues/13856 // https://github.com/grpc/grpc/issues/20311 - // In general, gRPC on MacOS struggles with TLS (both in the sense + // In general, gRPC on macOS struggles with TLS (both in the sense // of thread-locals and encryption) grpc_init(); diff --git a/cpp/src/arrow/flight/integration_tests/test_integration_server.cc b/cpp/src/arrow/flight/integration_tests/test_integration_server.cc index 6f31b82d1a9f4..b301955db8f58 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration_server.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration_server.cc @@ -40,7 +40,7 @@ #include "arrow/flight/test_util.h" DEFINE_int32(port, 31337, "Server port to listen on"); -DEFINE_string(scenario, "", "Integration test senario to run"); +DEFINE_string(scenario, "", "Integration test scenario to run"); namespace arrow { namespace flight { diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 6eba90c53a754..ffcffe12e3c78 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -226,7 +226,7 @@ class ARROW_FLIGHT_EXPORT FlightServerBase { /// \brief Shut down the server, blocking until current requests finish. /// /// Can be called from a signal handler or another thread while Serve() - /// blocks. Optionally a deadline can be set. Once the the deadline expires + /// blocks. Optionally a deadline can be set. Once the deadline expires /// server will wait until remaining running calls complete. /// /// Should only be called once. @@ -262,7 +262,7 @@ class ARROW_FLIGHT_EXPORT FlightServerBase { /// \brief Retrieve the current status of the target query /// \param[in] context The call context. /// \param[in] request the dataset request or a descriptor returned by a - /// prioir PollFlightInfo call + /// prior PollFlightInfo call /// \param[out] info the returned retry info provider /// \return Status virtual Status PollFlightInfo(const ServerCallContext& context, diff --git a/cpp/src/arrow/flight/sql/example/sqlite_server.cc b/cpp/src/arrow/flight/sql/example/sqlite_server.cc index 5e1043713295f..20b234e90ad3b 100644 --- a/cpp/src/arrow/flight/sql/example/sqlite_server.cc +++ b/cpp/src/arrow/flight/sql/example/sqlite_server.cc @@ -598,7 +598,7 @@ class SQLiteFlightSqlServer::Impl { const ServerCallContext& context, const GetPrimaryKeys& command) { std::stringstream table_query; - // The field key_name can not be recovered by the sqlite, so it is being set + // The field key_name cannot be recovered by the sqlite, so it is being set // to null following the same pattern for catalog_name and schema_name. table_query << "SELECT null as catalog_name, null as schema_name, table_name, " "name as column_name, pk as key_sequence, null as key_name\n" diff --git a/cpp/src/arrow/flight/sql/example/sqlite_statement.cc b/cpp/src/arrow/flight/sql/example/sqlite_statement.cc index 2363925660028..0305a1fa6b475 100644 --- a/cpp/src/arrow/flight/sql/example/sqlite_statement.cc +++ b/cpp/src/arrow/flight/sql/example/sqlite_statement.cc @@ -130,7 +130,7 @@ arrow::Result> SqliteStatement::GetSchema() const { if (column_decltype != NULLPTR) { ARROW_ASSIGN_OR_RAISE(data_type, GetArrowType(column_decltype)); } else { - // If it can not determine the actual column type, return a dense_union type + // If it cannot determine the actual column type, return a dense_union type // covering any type SQLite supports. 
data_type = GetUnknownColumnDataType(); } diff --git a/cpp/src/arrow/flight/sql/example/sqlite_type_info.h b/cpp/src/arrow/flight/sql/example/sqlite_type_info.h index a104626c0f4eb..f26ddc31e7f37 100644 --- a/cpp/src/arrow/flight/sql/example/sqlite_type_info.h +++ b/cpp/src/arrow/flight/sql/example/sqlite_type_info.h @@ -24,11 +24,11 @@ namespace flight { namespace sql { namespace example { -/// \brief Gets the harded-coded type info from Sqlite for all data types. +/// \brief Gets the hard-coded type info from Sqlite for all data types. /// \return A record batch. arrow::Result> DoGetTypeInfoResult(); -/// \brief Gets the harded-coded type info from Sqlite filtering +/// \brief Gets the hard-coded type info from Sqlite filtering /// for a specific data type. /// \return A record batch. arrow::Result> DoGetTypeInfoResult(int data_type_filter); diff --git a/cpp/src/arrow/flight/sql/server.h b/cpp/src/arrow/flight/sql/server.h index 360677c078c81..24f0aa2bd48cf 100644 --- a/cpp/src/arrow/flight/sql/server.h +++ b/cpp/src/arrow/flight/sql/server.h @@ -590,7 +590,7 @@ class ARROW_FLIGHT_SQL_EXPORT FlightSqlServerBase : public FlightServerBase { /// \brief Commit/rollback a transaction. /// \param[in] context The call context. - /// \param[in] request The tranaction. + /// \param[in] request The transaction. virtual Status EndTransaction(const ServerCallContext& context, const ActionEndTransactionRequest& request); diff --git a/cpp/src/arrow/flight/sql/types.h b/cpp/src/arrow/flight/sql/types.h index 293b1d5579ec0..b41488b68f232 100644 --- a/cpp/src/arrow/flight/sql/types.h +++ b/cpp/src/arrow/flight/sql/types.h @@ -535,7 +535,7 @@ struct ARROW_FLIGHT_SQL_EXPORT SqlInfoOptions { /// allowed for a column name. SQL_MAX_COLUMN_NAME_LENGTH = 543, - /// Retrieves a int64 value representing the the maximum number of columns + /// Retrieves a int64 value representing the maximum number of columns /// allowed in a GROUP BY clause. SQL_MAX_COLUMNS_IN_GROUP_BY = 544, @@ -846,7 +846,7 @@ struct ARROW_FLIGHT_SQL_EXPORT SqlInfoOptions { /// The level of support for Flight SQL transaction RPCs. enum SqlSupportedTransaction { - /// Unknown/not indicated/no supoprt + /// Unknown/not indicated/no support SQL_SUPPORTED_TRANSACTION_NONE = 0, /// Transactions, but not savepoints. 
SQL_SUPPORTED_TRANSACTION_TRANSACTION = 1, diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 40a0787d14a7a..2342c758273a3 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -575,7 +575,7 @@ struct ARROW_FLIGHT_EXPORT SchemaResult { std::string raw_schema_; }; -/// \brief The access coordinates for retireval of a dataset, returned by +/// \brief The access coordinates for retrieval of a dataset, returned by /// GetFlightInfo class ARROW_FLIGHT_EXPORT FlightInfo { public: @@ -604,7 +604,7 @@ class ARROW_FLIGHT_EXPORT FlightInfo { /// bookkeeping /// \param[in,out] dictionary_memo for dictionary bookkeeping, will /// be modified - /// \return Arrrow result with the reconstructed Schema + /// \return Arrow result with the reconstructed Schema arrow::Result> GetSchema( ipc::DictionaryMemo* dictionary_memo) const; diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index e023e6a3a44d3..9b56928c68843 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -793,8 +793,6 @@ void CheckPrimitive(const std::shared_ptr& type, } TEST(TestJsonSchemaWriter, FlatTypes) { - // TODO - // field("f14", date32()) std::vector> fields = { field("f0", int8()), field("f1", int16(), false), @@ -822,6 +820,8 @@ TEST(TestJsonSchemaWriter, FlatTypes) { field("f21", run_end_encoded(int16(), utf8())), field("f22", run_end_encoded(int32(), utf8())), field("f23", run_end_encoded(int64(), utf8())), + field("f24", list_view(int32())), + field("f25", large_list_view(uint8())), }; auto schema = ::arrow::schema(fields); @@ -1147,10 +1147,12 @@ TEST_P(TestJsonRoundTrip, RoundTrip) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 59749c36a958e..64eb342d5bd47 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -236,7 +236,7 @@ class SchemaWriter { enable_if_t::value || is_primitive_ctype::value || is_base_binary_type::value || is_binary_view_like_type::value || is_var_length_list_type::value || is_struct_type::value || - is_run_end_encoded_type::value> + is_run_end_encoded_type::value || is_list_view_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -422,6 +422,16 @@ class SchemaWriter { return Status::OK(); } + Status Visit(const ListViewType& type) { + WriteName("listview", type); + return Status::OK(); + } + + Status Visit(const LargeListViewType& type) { + WriteName("largelistview", type); + return Status::OK(); + } + Status Visit(const MapType& type) { WriteName("map", type); return Status::OK(); @@ -777,6 +787,15 @@ class ArrayWriter { return WriteChildren(array.type()->fields(), {array.values()}); } + template + enable_if_list_view Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length()); + WriteIntegerField("SIZE", array.raw_value_sizes(), array.length()); + return WriteChildren(array.type()->fields(), {array.values()}); + } + Status Visit(const FixedSizeListArray& array) { 
WriteValidityField(array); const auto& type = checked_cast(*array.type()); @@ -1132,6 +1151,16 @@ Result> GetType(const RjObject& json_type, return Status::Invalid("Large list must have exactly one child"); } return large_list(children[0]); + } else if (type_name == "listview") { + if (children.size() != 1) { + return Status::Invalid("List-view must have exactly one child"); + } + return list_view(children[0]); + } else if (type_name == "largelistview") { + if (children.size() != 1) { + return Status::Invalid("Large list-view must have exactly one child"); + } + return large_list_view(children[0]); } else if (type_name == "map") { return GetMap(json_type, children); } else if (type_name == "fixedsizelist") { @@ -1651,6 +1680,26 @@ class ArrayReader { return CreateList(type_); } + template + Status CreateListView(const std::shared_ptr& type) { + using offset_type = typename T::offset_type; + + RETURN_NOT_OK(InitializeData(3)); + + RETURN_NOT_OK(GetNullBitmap()); + ARROW_ASSIGN_OR_RAISE(const auto json_offsets, GetMemberArray(obj_, "OFFSET")); + RETURN_NOT_OK(GetIntArray(json_offsets, length_, &data_->buffers[1])); + ARROW_ASSIGN_OR_RAISE(const auto json_sizes, GetMemberArray(obj_, "SIZE")); + RETURN_NOT_OK(GetIntArray(json_sizes, length_, &data_->buffers[2])); + RETURN_NOT_OK(GetChildren(obj_, *type)); + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& type) { + return CreateListView(type_); + } + Status Visit(const MapType& type) { auto list_type = std::make_shared(type.value_field()); RETURN_NOT_OK(CreateList(list_type)); diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index 72977f0f297f5..6c484242a4fc8 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -279,6 +279,8 @@ class CompressedInputStream::Impl { // Decompress some data from the compressed_ buffer. // Call this function only if the decompressed_ buffer is empty. Status DecompressData() { + DCHECK_NE(compressed_->data(), nullptr); + int64_t decompress_size = kDecompressSize; while (true) { @@ -329,7 +331,7 @@ class CompressedInputStream::Impl { // Try to feed more data into the decompressed_ buffer. Status RefillDecompressed(bool* has_data) { // First try to read data from the decompressor - if (compressed_) { + if (compressed_ && compressed_->size() != 0) { if (decompressor_->IsFinished()) { // We just went over the end of a previous compressed stream. RETURN_NOT_OK(decompressor_->Reset()); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index d13e0714cbf5a..5b760a2b5a9cf 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -159,6 +159,8 @@ class ARROW_EXPORT BufferReader BufferReader(const uint8_t* data, int64_t size); /// \brief Instantiate from std::string_view. Does not own data + /// \deprecated Deprecated in 14.0.0. Use FromString or + /// BufferReader(std::shared_ptr buffer) instead. ARROW_DEPRECATED( "Deprecated in 14.0.0. 
Use FromString or BufferReader(std::shared_ptr " "buffer) instead.") diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 0b6ae4f620647..80e441fe2b670 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -329,9 +329,11 @@ namespace { const std::vector kBatchCases = { &ipc::test::MakeIntRecordBatch, &ipc::test::MakeListRecordBatch, + &ipc::test::MakeListViewRecordBatch, &ipc::test::MakeFixedSizeListRecordBatch, &ipc::test::MakeNonNullRecordBatch, &ipc::test::MakeDeeplyNestedList, + &ipc::test::MakeDeeplyNestedListView, &ipc::test::MakeStringTypesRecordBatchWithNulls, &ipc::test::MakeStruct, &ipc::test::MakeUnion, diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 50be10991ff9f..682c352132a11 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -74,6 +74,8 @@ Result>> Batches() { batches.push_back(batch); RETURN_NOT_OK(test::MakeListRecordBatch(&batch)); batches.push_back(batch); + RETURN_NOT_OK(test::MakeListViewRecordBatch(&batch)); + batches.push_back(batch); RETURN_NOT_OK(test::MakeDictionary(&batch)); batches.push_back(batch); RETURN_NOT_OK(test::MakeTimestamps(&batch)); diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 4d2d803f3f65e..ceeabe01677ed 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -123,12 +123,16 @@ Status GetConverter(const std::shared_ptr&, std::shared_ptr template class ConcreteConverter : public Converter { public: - Status AppendValues(const rj::Value& json_array) override { - auto self = static_cast(this); - if (!json_array.IsArray()) { - return JSONTypeError("array", json_array.GetType()); + Result SizeOfJSONArray(const rj::Value& json_obj) { + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); } - auto size = json_array.Size(); + return json_obj.Size(); + } + + Status AppendValues(const rj::Value& json_array) final { + auto self = static_cast(this); + ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); for (uint32_t i = 0; i < size; ++i) { RETURN_NOT_OK(self->AppendValue(json_array[i])); } @@ -536,15 +540,19 @@ class FixedSizeBinaryConverter final // Converter for list arrays template -class ListConverter final : public ConcreteConverter> { +class VarLengthListLikeConverter final + : public ConcreteConverter> { public: using BuilderType = typename TypeTraits::BuilderType; - explicit ListConverter(const std::shared_ptr& type) { this->type_ = type; } + explicit VarLengthListLikeConverter(const std::shared_ptr& type) { + this->type_ = type; + } Status Init() override { - const auto& list_type = checked_cast(*this->type_); - RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + const auto& var_length_list_like_type = checked_cast(*this->type_); + RETURN_NOT_OK( + GetConverter(var_length_list_like_type.value_type(), &child_converter_)); auto child_builder = child_converter_->builder(); builder_ = std::make_shared(default_memory_pool(), child_builder, this->type_); @@ -555,8 +563,9 @@ class ListConverter final : public ConcreteConverter> { if (json_obj.IsNull()) { return this->AppendNull(); } - RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array + ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); + RETURN_NOT_OK(builder_->Append(true, size)); return child_converter_->AppendValues(json_obj); } @@ -898,8 
+907,11 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LIST_VIEW, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST_VIEW, + VarLengthListLikeConverter) SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index b67c26999945b..ea3a9ae1a14a9 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -59,6 +59,9 @@ using ::arrow::internal::BytesToBits; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; +using ListAndListViewTypes = + ::testing::Types; + // Avoid undefined behaviour on signed overflow template Signed SafeSignedAdd(Signed u, Signed v) { @@ -591,145 +594,207 @@ TEST(TestDecimal, Dictionary) { } } -TEST(TestList, IntegerList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(int64()); - std::shared_ptr offsets, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - ArrayFromVector({}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 2, 2, 3}, &offsets); - ArrayFromVector({4, 5, 6}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); +template +class TestVarLengthListArray : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetType = typename TypeTraits::OffsetType; + + static constexpr bool is_list_view_type = is_list_view(TypeClass::type_id); + + void TestIntegerList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + 
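// The converter above now sizes each slot up front: it measures the JSON array,
// calls builder_->Append(is_valid, size), and only then fills the child builder,
// which is what list-view builders need in order to record each view's length.
// A standalone sketch of that builder pattern, assuming the ListViewBuilder API
// used by the converter; the function name is illustrative:
#include <memory>
#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Array>> BuildListViewLikeTheConverter(
    arrow::MemoryPool* pool) {
  auto child_builder = std::make_shared<arrow::Int64Builder>(pool);
  arrow::ListViewBuilder builder(pool, child_builder, arrow::list_view(arrow::int64()));

  // Equivalent of converting the JSON value [[4, 5], null, []]:
  ARROW_RETURN_NOT_OK(builder.Append(/*is_valid=*/true, /*length=*/2));
  ARROW_RETURN_NOT_OK(child_builder->AppendValues({4, 5}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append(/*is_valid=*/true, /*length=*/0));
  return builder.Finish();
}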
ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 0, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestList, IntegerListErrors) { - std::shared_ptr type = list(int64()); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); -} - -TEST(TestList, NullList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(null()); - std::shared_ptr offsets, values, expected, actual; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - values = std::make_shared(0); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + void TestIntegerListErrors() { + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr array; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - values = std::make_shared(3); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); + } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = 
checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); + void TestNullList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(null()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} -TEST(TestList, IntegerListList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(list(uint8())); - std::shared_ptr offsets, values, nested, expected, actual; + void TestIntegerListList() { + auto pool = default_memory_pool(); + std::shared_ptr type = + std::make_shared(std::make_shared(uint8())); + std::shared_ptr offsets, sizes, values, nested, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({1, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 2, 3}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 0, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, 
pool)); + } + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 3, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.Append(true, 0)); + ASSERT_OK(list_builder.Finish(&expected)); + } + } +}; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 2, 3}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 2); - AssertArraysEqual(*expected, *actual); +TYPED_TEST_SUITE(TestVarLengthListArray, ListAndListViewTypes); - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 0, 1, 4, 5}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 4); - AssertArraysEqual(*expected, *actual); +TYPED_TEST(TestVarLengthListArray, IntegerList) { this->TestIntegerList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto& child_builder = checked_cast(*list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.Append()); - ASSERT_OK(list_builder.Finish(&expected)); - } -} +TYPED_TEST(TestVarLengthListArray, IntegerListErrors) { this->TestIntegerListErrors(); } -TEST(TestLargeList, Basics) { - // Similar as TestList above, only testing the basics - auto pool = default_memory_pool(); - std::shared_ptr type = large_list(int16()); - std::shared_ptr offsets, values, expected, actual; +TYPED_TEST(TestVarLengthListArray, NullList) { this->TestNullList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, LargeListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); -} +TYPED_TEST(TestVarLengthListArray, IntegerListList) { this->TestIntegerListList(); } 
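// These typed tests exercise the construction difference between lists and
// list-views: ListArray::FromArrays takes length + 1 monotonically increasing
// offsets, while ListViewArray::FromArrays takes `length` offsets plus `length`
// sizes, and the views are free to overlap or go backwards. A minimal sketch under
// those assumptions; the helper name is illustrative:
#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Array>> MakeOverlappingListViews(
    arrow::MemoryPool* pool) {
  // values = [1, 2, 3]; view 0 -> [1, 2, 3], view 1 -> [2, 3], view 2 -> [1, 2].
  arrow::Int32Builder values_builder(pool), offsets_builder(pool), sizes_builder(pool);
  ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3}));
  ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 1, 0}));
  ARROW_RETURN_NOT_OK(sizes_builder.AppendValues({3, 2, 2}));
  ARROW_ASSIGN_OR_RAISE(auto values, values_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto offsets, offsets_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto sizes, sizes_builder.Finish());

  ARROW_ASSIGN_OR_RAISE(
      auto list_view, arrow::ListViewArray::FromArrays(*offsets, *sizes, *values, pool));
  // Overlapping and out-of-order views validate fine; the same child values are
  // simply referenced more than once.
  ARROW_RETURN_NOT_OK(list_view->ValidateFull());
  return list_view;
}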
TEST(TestMap, IntegerToInteger) { auto type = map(int16(), int16()); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index ab1a58dd1df8b..4f41edf8e15db 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -361,6 +361,18 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, } *out = std::make_shared(children[0]); return Status::OK(); + case flatbuf::Type::ListView: + if (children.size() != 1) { + return Status::Invalid("ListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); + case flatbuf::Type::LargeListView: + if (children.size() != 1) { + return Status::Invalid("LargeListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); case flatbuf::Type::Map: if (children.size() != 1) { return Status::Invalid("Map must have exactly 1 child field"); @@ -669,6 +681,20 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const ListViewType& type) { + fb_type_ = flatbuf::Type::ListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = flatbuf::CreateListView(fbb_).Union(); + return Status::OK(); + } + + Status Visit(const LargeListViewType& type) { + fb_type_ = flatbuf::Type::LargeListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = flatbuf::CreateListView(fbb_).Union(); + return Status::OK(); + } + Status Visit(const MapType& type) { fb_type_ = flatbuf::Type::Map; RETURN_NOT_OK(VisitChildFields(type)); diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 313346b5deced..5c15cb912e4a7 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -376,10 +376,12 @@ TEST_F(TestSchemaMetadata, MetadataVersionForwardCompatibility) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, @@ -974,6 +976,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeListRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeListViewRecordBatch(&batch)); + TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeZeroLengthRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); @@ -982,6 +987,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeDeeplyNestedList(&batch)); TestGetRecordBatchSize(options_, batch); + + ASSERT_OK(MakeDeeplyNestedListView(&batch)); + TestGetRecordBatchSize(options_, batch); } class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 5dd01f2015dd7..d272c78560f82 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -254,7 +254,12 @@ class ArrayLoader { if (i >= static_cast(variadic_counts->size())) { return Status::IOError("variadic_count_index out of range."); } - return static_cast(variadic_counts->Get(i)); + int64_t count = variadic_counts->Get(i); + if (count < 0 || count > std::numeric_limits::max()) { + return Status::IOError( + "variadic_count must be representable as a positive int32_t, got ", count, "."); + } + return static_cast(count); } Status GetFieldMetadata(int field_index, ArrayData* out) { @@ 
-330,6 +335,22 @@ class ArrayLoader { return LoadChildren(type.fields()); } + template + Status LoadListView(const TYPE& type) { + out_->buffers.resize(3); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[2])); + + const int num_children = type.num_fields(); + if (num_children != 1) { + return Status::Invalid("Wrong number of children: ", num_children); + } + + return LoadChildren(type.fields()); + } + Status LoadChildren(const std::vector>& child_fields) { DCHECK_NE(out_, nullptr); ArrayData* parent = out_; @@ -372,10 +393,10 @@ class ArrayLoader { RETURN_NOT_OK(LoadCommon(type.id())); RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, GetVariadicCount(variadic_count_index_++)); - out_->buffers.resize(character_buffer_count + 2); - for (size_t i = 0; i < character_buffer_count; ++i) { + out_->buffers.resize(data_buffer_count + 2); + for (size_t i = 0; i < data_buffer_count; ++i) { RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); } return Status::OK(); @@ -392,6 +413,11 @@ class ArrayLoader { return LoadList(type); } + template + enable_if_list_view Visit(const T& type) { + return LoadListView(type); + } + Status Visit(const MapType& type) { RETURN_NOT_OK(LoadList(type)); return MapArray::ValidateChildData(out_->child_data); diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 6faaf96b332d4..87c02e2d87a1e 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -189,6 +189,32 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li return MakeListArray(child_array, num_lists, include_nulls, pool, out); } +Status MakeRandomListViewArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 0.5 : 0.0; + *out = rand.ListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, /*coverage=*/0.9, + kDefaultBufferAlignment, pool); + return Status::OK(); +} + +Status MakeRandomLargeListViewArray(const std::shared_ptr& child_array, + int num_lists, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 
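// LoadListView above differs from LoadList in that it expects three buffers per
// list-view column: validity, offsets, and sizes, in that order. A small sanity
// check of that layout on an in-memory array; a sketch, not code from the patch,
// and the function name is illustrative:
#include <cassert>
#include <cstdint>
#include "arrow/api.h"

void CheckListViewBufferLayout(const arrow::ListViewArray& array) {
  const auto& data = *array.data();
  assert(data.buffers.size() == 3);
  // buffers[0]: optional validity bitmap (may be absent when there are no nulls)
  // buffers[1]: one int32_t offset per view
  // buffers[2]: one int32_t size per view
  assert(data.buffers[1] && data.buffers[2]);
  const int64_t needed =
      static_cast<int64_t>(sizeof(int32_t)) * (data.offset + data.length);
  assert(data.buffers[1]->size() >= needed);
  assert(data.buffers[2]->size() >= needed);
}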
0.5 : 0.0; + *out = rand.LargeListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, + /*force_empty_nulls=*/0.9, kDefaultBufferAlignment, pool); + return Status::OK(); +} + Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { @@ -418,6 +444,31 @@ Status MakeListRecordBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeListViewRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = field("f0", list_view(int32())); + auto f1 = field("f1", list_view(list_view(int32()))); + auto f2 = field("f2", large_list_view(int32())); + auto schema = ::arrow::schema({f0, f1, f2}); + + // Example data + + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomListViewArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListViewArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomLargeListViewArray(leaf_values, length, include_nulls, pool, + &large_list_array)); + *out = + RecordBatch::Make(schema, length, {list_array, list_list_array, large_list_array}); + return Status::OK(); +} + Status MakeFixedSizeListRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", fixed_size_list(int32(), 1)); @@ -505,6 +556,27 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { return Status::OK(); } +Status MakeDeeplyNestedListView(std::shared_ptr* out) { + const int batch_length = 5; + auto type = int32(); + + MemoryPool* pool = default_memory_pool(); + std::shared_ptr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); + for (int i = 0; i < 63; ++i) { + type = std::static_pointer_cast(list_view(type)); + RETURN_NOT_OK( + MakeRandomListViewArray(array, batch_length, include_nulls, pool, &array)); + } + + auto f0 = field("f0", type); + auto schema = ::arrow::schema({f0}); + std::vector> arrays = {array}; + *out = RecordBatch::Make(schema, batch_length, arrays); + return Status::OK(); +} + Status MakeStruct(std::shared_ptr* out) { // reuse constructed list columns std::shared_ptr list_batch; diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index fc0c8ddbea319..db8613cbb1e6a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -107,6 +107,9 @@ Status MakeNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeListRecordBatch(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeListViewRecordBatch(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); @@ -119,6 +122,9 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeDeeplyNestedList(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeDeeplyNestedListView(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeStruct(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 9668f459d0d31..93256440f4a7a 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -350,6 +350,67 @@ class RecordBatchSerializer { return Status::OK(); } + template + Status GetZeroBasedListViewOffsets(const ArrayType& 
array, + std::shared_ptr* out_value_offsets, + offset_type* out_min_offset, + offset_type* out_max_end) { + auto offsets = array.value_offsets(); + auto sizes = array.value_sizes(); + + const int64_t required_bytes = sizeof(offset_type) * array.length(); + if (array.offset() != 0) { + // If we have a non-zero offset, it's likely that the smallest offset is + // not zero. We must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly. + + ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, + AllocateBuffer(required_bytes, options_.memory_pool)); + offset_type min_offset = 0; + offset_type max_end = 0; + if (array.length() > 0) { + min_offset = std::numeric_limits::max(); + for (int i = 0; i < array.length(); ++i) { + min_offset = std::min(min_offset, array.value_offset(i)); + max_end = std::max(max_end, array.value_offset(i) + array.value_length(i)); + } + } + + auto* dest_offsets = shifted_offsets->mutable_data_as(); + + for (int i = 0; i < array.length(); ++i) { + dest_offsets[i] = array.value_offset(i) - min_offset; + } + *out_min_offset = min_offset; + *out_max_end = max_end; + offsets = std::move(shifted_offsets); + } else { + // ARROW-6046: Slice offsets to used extent, in case we have a truncated + // slice + if (offsets != nullptr && offsets->size() > required_bytes) { + offsets = SliceBuffer(offsets, 0, required_bytes); + } + *out_min_offset = 0; + *out_max_end = static_cast(array.values()->length()); + } + *out_value_offsets = std::move(offsets); + return Status::OK(); + } + + template + Status GetListViewSizes(const ArrayType& array, + std::shared_ptr* out_value_sizes) { + const int64_t required_bytes = sizeof(offset_type) * array.length(); + auto sizes = array.value_sizes(); + if (sizes != nullptr && (array.offset() != 0 || sizes->size() > required_bytes)) { + // ARROW-6046: Slice offsets to used extent, in case we have a truncated slice + auto offset_bytes = array.offset() * sizeof(offset_type); + sizes = SliceBuffer(sizes, offset_bytes, required_bytes); + } + *out_value_sizes = std::move(sizes); + return Status::OK(); + } + Status Visit(const BooleanArray& array) { std::shared_ptr data; RETURN_NOT_OK(GetTruncatedBitmap(array.offset(), array.length(), array.values(), @@ -428,7 +489,6 @@ class RecordBatchSerializer { RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); out_->body_buffers.emplace_back(value_offsets); - --max_recursion_depth_; std::shared_ptr values = array.values(); offset_type values_offset = 0; @@ -442,6 +502,37 @@ class RecordBatchSerializer { // Must also slice the values values = values->Slice(values_offset, values_length); } + --max_recursion_depth_; + RETURN_NOT_OK(VisitArray(*values)); + ++max_recursion_depth_; + return Status::OK(); + } + + template + enable_if_list_view Visit(const T& array) { + using offset_type = typename T::offset_type; + + offset_type min_offset = 0; + offset_type max_end = 0; + { + std::shared_ptr value_offsets; + RETURN_NOT_OK( + GetZeroBasedListViewOffsets(array, &value_offsets, &min_offset, &max_end)); + out_->body_buffers.push_back(std::move(value_offsets)); + } + { + std::shared_ptr value_sizes; + RETURN_NOT_OK(GetListViewSizes(array, &value_sizes)); + out_->body_buffers.push_back(std::move(value_sizes)); + } + + std::shared_ptr values = array.values(); + + if (min_offset != 0 || max_end < values->length()) { + // Must also slice the values + values = values->Slice(min_offset, max_end); + } + --max_recursion_depth_; RETURN_NOT_OK(VisitArray(*values)); ++max_recursion_depth_; return 
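// The writer above rebases the offsets of a sliced or offset list-view array so the
// serialized child only spans the range the views actually reference. A sketch of
// that range computation using the public accessors; it mirrors the intent of
// GetZeroBasedListViewOffsets but is not the patch code itself:
#include <algorithm>
#include <limits>
#include <utility>
#include "arrow/api.h"

// Returns [min_offset, max_end): the slice of the child values reachable from the
// views of `array`. The writer shifts all offsets down by min_offset and slices the
// child values to this range before writing the body buffers.
std::pair<int32_t, int32_t> ReferencedValueRange(const arrow::ListViewArray& array) {
  int32_t min_offset = 0;
  int32_t max_end = 0;
  if (array.length() > 0) {
    min_offset = std::numeric_limits<int32_t>::max();
    for (int64_t i = 0; i < array.length(); ++i) {
      min_offset = std::min(min_offset, array.value_offset(i));
      max_end = std::max(max_end, array.value_offset(i) + array.value_length(i));
    }
  }
  return {min_offset, max_end};
}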
Status::OK(); diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index 185dcde355f0a..761ff02dd40cb 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -769,8 +769,8 @@ class HandlerBase : public BlockParser, rj::kParseNumbersAsStringsFlag; rj::Reader reader; - - for (; num_rows_ < kMaxParserNumRows; ++num_rows_) { + // ensure that the loop can exit when the block is too large. + for (; num_rows_ < std::numeric_limits<int32_t>::max(); ++num_rows_) { auto ok = reader.Parse(json, handler); switch (ok.Code()) { case rj::kParseErrorNone: @@ -790,7 +790,7 @@ class HandlerBase : public BlockParser, return ParseError(rj::GetParseError_En(ok.Code()), " in row ", num_rows_); } } - return Status::Invalid("Exceeded maximum rows"); + return Status::Invalid("Row count overflowed int32_t"); } template diff --git a/cpp/src/arrow/json/parser.h b/cpp/src/arrow/json/parser.h index e21d09c4169d0..aca416dbb7b5b 100644 --- a/cpp/src/arrow/json/parser.h +++ b/cpp/src/arrow/json/parser.h @@ -56,8 +56,6 @@ struct Kind { static Status ForType(const DataType& type, Kind::type* kind); }; -constexpr int32_t kMaxParserNumRows = 100000; - /// \class BlockParser /// \brief A reusable block-based parser for JSON data /// diff --git a/cpp/src/arrow/json/parser_benchmark.cc b/cpp/src/arrow/json/parser_benchmark.cc index 2a1629ef8e230..a5a6eb68e67a7 100644 --- a/cpp/src/arrow/json/parser_benchmark.cc +++ b/cpp/src/arrow/json/parser_benchmark.cc @@ -200,8 +200,6 @@ static void ParseJSONFields(benchmark::State& state) { // NOLINT non-const refe int32_t num_rows = static_cast<int32_t>(2e4 / (1.0 - sparsity) / num_fields); // ... however, we want enough rows to make setup/finish overhead negligible num_rows = std::max(num_rows, 200); - // ... and also we want to avoid an "Exceeded maximum rows" error. - num_rows = std::min(num_rows, kMaxParserNumRows); // In the end, we will empirically generate between 400 kB and 4 MB of JSON data.
auto fields = GenerateTestFields(num_fields, 10); diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index f7ab6fd10275f..2f819779bdb59 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -135,6 +135,10 @@ struct GenerateImpl { return OK(writer.EndArray(size)); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); } Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index b392e027a6b89..e666ec70f9489 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -249,7 +249,8 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_list_like WriteDataValues(const ArrayType& array) { + enable_if_t::value || is_list_view_type::value, Status> + WriteDataValues(const ArrayType& array) { const auto values = array.values(); const auto child_options = ChildOptions(); ArrayPrinter values_printer(child_options, sink_); @@ -300,6 +301,8 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value, Status> diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 9217e190d5b62..0db6ae4867299 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -774,8 +774,11 @@ TEST_F(TestPrettyPrint, BinaryNoNewlines) { CheckPrimitive(options, is_valid, values, expected, false); } -TEST_F(TestPrettyPrint, ListType) { - auto list_type = list(int64()); +template +void TestPrettyPrintVarLengthListLike() { + using LargeTypeClass = typename TypeTraits::LargeType; + auto var_list_type = std::make_shared(int64()); + auto var_large_list_type = std::make_shared(int64()); static const char* ex = R"expected([ [ @@ -836,7 +839,7 @@ TEST_F(TestPrettyPrint, ListType) { ] ])expected"; - auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + auto array = ArrayFromJSON(var_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); auto make_options = [](int indent, int window, int container_window) { auto options = PrettyPrintOptions(indent, window); options.container_window = container_window; @@ -850,8 +853,7 @@ TEST_F(TestPrettyPrint, ListType) { ex_3); CheckArray(*array, {0, 10}, ex_4); - list_type = large_list(int64()); - array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + array = ArrayFromJSON(var_large_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); CheckStream(*array, make_options(/*indent=*/0, /*window=*/10, /*container_window=*/5), ex); CheckStream(*array, make_options(/*indent=*/2, /*window=*/10, /*container_window=*/5), @@ -861,6 +863,93 @@ TEST_F(TestPrettyPrint, ListType) { CheckArray(*array, {0, 10}, ex_4); } +TEST_F(TestPrettyPrint, ListType) { TestPrettyPrintVarLengthListLike(); } + +template +void TestListViewSpecificPrettyPrinting() { + using ArrayType = typename TypeTraits::ArrayType; + using OffsetType = typename TypeTraits::OffsetType; + + auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + auto int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + auto int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + + auto Offsets = 
[](std::string_view json) { + return ArrayFromJSON(TypeTraits::type_singleton(), json); + }; + auto Sizes = Offsets; + + ASSERT_OK_AND_ASSIGN(auto int_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *int32_values)); + ASSERT_OK(int_list_view_array->ValidateFull()); + static const char* ex1 = + "[\n" + " [\n" + " 1,\n" + " 20\n" + " ],\n" + " [\n" + " 1\n" + " ],\n" + " [\n" + " 20\n" + " ],\n" + " [\n" + " 3\n" + " ]\n" + "]"; + CheckStream(*int_list_view_array, {}, ex1); + + ASSERT_OK_AND_ASSIGN(auto string_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + ASSERT_OK(string_list_view_array->ValidateFull()); + static const char* ex2 = + "[\n" + " [\n" + " \"Hello\",\n" + " \"World\"\n" + " ],\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ],\n" + " [\n" + " null\n" + " ]\n" + "]"; + CheckStream(*string_list_view_array, {}, ex2); + + auto sliced_array = string_list_view_array->Slice(1, 2); + static const char* ex3 = + "[\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ]\n" + "]"; + CheckStream(*sliced_array, {}, ex3); + + ASSERT_OK_AND_ASSIGN( + auto empty_array, + ArrayType::FromArrays(*Offsets("[]"), *Sizes("[]"), *int16_values)); + ASSERT_OK(empty_array->ValidateFull()); + static const char* ex4 = "[]"; + CheckStream(*empty_array, {}, ex4); +} + +TEST_F(TestPrettyPrint, ListViewType) { + TestPrettyPrintVarLengthListLike(); + + TestListViewSpecificPrettyPrinting(); + TestListViewSpecificPrettyPrinting(); +} + TEST_F(TestPrettyPrint, ListTypeNoNewlines) { auto list_type = list(int64()); auto empty_array = ArrayFromJSON(list_type, "[]"); diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 167e272705268..6996b46c8b61a 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -587,6 +587,12 @@ ListScalar::ListScalar(std::shared_ptr value, bool is_valid) LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, large_list(value->type()), is_valid) {} +ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, list_view(value->type()), is_valid) {} + +LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + inline std::shared_ptr MakeMapType(const std::shared_ptr& pair_type) { ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT); ARROW_CHECK_EQ(pair_type->num_fields(), 2); @@ -776,14 +782,6 @@ struct MakeNullImpl { return Status::OK(); } - template ::ScalarType> - Status VisitListLike(const T& type, int64_t value_size = 0) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, - MakeArrayOfNull(type.value_type(), value_size)); - out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); - return Status::OK(); - } - Status Visit(const FixedSizeBinaryType& type) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, AllocateBuffer(type.byte_width())); @@ -794,11 +792,25 @@ struct MakeNullImpl { return Status::OK(); } + template ::ScalarType> + Status VisitListLike(const T& type, int64_t list_size = 0) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, + MakeArrayOfNull(type.value_type(), list_size)); + out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); + return Status::OK(); + } + Status Visit(const ListType& type) { return VisitListLike(type); } + Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status Visit(const MapType& 
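// The list-view-specific cases above cover views that share child values and arrays
// sliced after construction. A small usage sketch of printing such data, assuming
// the ArrayFromJSON test helper from arrow/testing/gtest_util.h, which aborts on
// malformed input and builds contiguous, non-overlapping views:
#include <iostream>
#include "arrow/api.h"
#include "arrow/pretty_print.h"
#include "arrow/testing/gtest_util.h"

arrow::Status PrintListViewExample() {
  auto array = arrow::ArrayFromJSON(arrow::list_view(arrow::int64()),
                                    "[[null], [], null, [4, 6, 7], [2, 3]]");
  ARROW_RETURN_NOT_OK(arrow::PrettyPrint(*array, /*indent=*/2, &std::cout));
  // Slicing narrows which views get printed; the child values stay shared.
  return arrow::PrettyPrint(*array->Slice(1, 2), /*indent=*/2, &std::cout);
}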
type) { return VisitListLike(type); } - Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status Visit(const ListViewType& type) { return VisitListLike(type); } + + Status Visit(const LargeListViewType& type) { + return VisitListLike(type); + } Status Visit(const FixedSizeListType& type) { return VisitListLike(type, type.list_size()); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 5175b0128524c..65c5ee4df0a04 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -531,6 +531,20 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar { explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); }; +struct ARROW_EXPORT ListViewScalar : public BaseListScalar { + using TypeClass = ListViewType; + using BaseListScalar::BaseListScalar; + + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + +struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar { + using TypeClass = LargeListViewType; + using BaseListScalar::BaseListScalar; + + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + struct ARROW_EXPORT MapScalar : public BaseListScalar { using TypeClass = MapType; using BaseListScalar::BaseListScalar; diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index a188aea1669a4..9d40e688f1dfb 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -30,6 +30,7 @@ #include "arrow/array.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/compute/cast.h" #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -40,6 +41,9 @@ namespace arrow { +using compute::Cast; +using compute::CastOptions; + using internal::checked_cast; using internal::checked_pointer_cast; @@ -394,6 +398,10 @@ class TestRealScalar : public ::testing::Test { void TestLargeListOf() { TestListOf(large_list(type_)); } + void TestListViewOf() { TestListOf(list_view(type_)); } + + void TestLargeListViewOf() { TestListOf(large_list_view(type_)); } + protected: std::shared_ptr type_; std::shared_ptr scalar_val_, scalar_other_, scalar_nan_, scalar_other_nan_, @@ -414,6 +422,10 @@ TYPED_TEST(TestRealScalar, ListOf) { this->TestListOf(); } TYPED_TEST(TestRealScalar, LargeListOf) { this->TestLargeListOf(); } +TYPED_TEST(TestRealScalar, ListViewOf) { this->TestListViewOf(); } + +TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); } + template class TestDecimalScalar : public ::testing::Test { public: @@ -854,9 +866,9 @@ TEST(TestTimestampScalars, MakeScalar) { TEST(TestTimestampScalars, Cast) { auto convert = [](TimeUnit::type in, TimeUnit::type out, int64_t value) -> int64_t { - auto scalar = - TimestampScalar(value, timestamp(in)).CastTo(timestamp(out)).ValueOrDie(); - return internal::checked_pointer_cast(scalar)->value; + EXPECT_OK_AND_ASSIGN(auto casted, Cast(TimestampScalar(value, timestamp(in)), + timestamp(out), CastOptions::Unsafe())); + return internal::checked_pointer_cast(casted.scalar())->value; }; EXPECT_EQ(convert(TimeUnit::SECOND, TimeUnit::MILLI, 1), 1000); @@ -1031,22 +1043,22 @@ TYPED_TEST(TestNumericScalar, Cast) { std::shared_ptr other_scalar; ASSERT_OK_AND_ASSIGN(other_scalar, Scalar::Parse(other_type, repr)); - ASSERT_OK_AND_ASSIGN(auto cast_to_other, scalar->CastTo(other_type)) - ASSERT_EQ(*cast_to_other, *other_scalar); + ASSERT_OK_AND_ASSIGN(auto cast_to_other, Cast(scalar, other_type)) + ASSERT_EQ(*cast_to_other.scalar(), *other_scalar); - ASSERT_OK_AND_ASSIGN(auto 
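// The tests above move from the deprecated Scalar::CastTo to compute::Cast, which
// returns a Datum wrapping the casted scalar. A minimal usage sketch of that
// replacement API, assuming arrow/compute/cast.h; the function name is illustrative:
#include <memory>
#include "arrow/api.h"
#include "arrow/compute/cast.h"
#include "arrow/util/checked_cast.h"

arrow::Result<int64_t> SecondsToMillis(int64_t seconds_since_epoch) {
  std::shared_ptr<arrow::Scalar> seconds_scalar = std::make_shared<arrow::TimestampScalar>(
      seconds_since_epoch, arrow::timestamp(arrow::TimeUnit::SECOND));
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum casted,
      arrow::compute::Cast(seconds_scalar, arrow::timestamp(arrow::TimeUnit::MILLI),
                           arrow::compute::CastOptions::Unsafe()));
  return arrow::internal::checked_pointer_cast<arrow::TimestampScalar>(casted.scalar())
      ->value;
}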
cast_from_other, other_scalar->CastTo(type)) - ASSERT_EQ(*cast_from_other, *scalar); + ASSERT_OK_AND_ASSIGN(auto cast_from_other, Cast(other_scalar, type)) + ASSERT_EQ(*cast_from_other.scalar(), *scalar); } ASSERT_OK_AND_ASSIGN(auto cast_from_string, - StringScalar(std::string(repr)).CastTo(type)); - ASSERT_EQ(*cast_from_string, *scalar); + Cast(StringScalar(std::string(repr)), type)); + ASSERT_EQ(*cast_from_string.scalar(), *scalar); if (is_integer_type::value) { - ASSERT_OK_AND_ASSIGN(auto cast_to_string, scalar->CastTo(utf8())); - ASSERT_EQ( - std::string_view(*checked_cast(*cast_to_string).value), - repr); + ASSERT_OK_AND_ASSIGN(auto cast_to_string, Cast(scalar, utf8())); + ASSERT_EQ(std::string_view( + *checked_cast(*cast_to_string.scalar()).value), + repr); } } } @@ -1083,7 +1095,7 @@ void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& } template -class TestListScalar : public ::testing::Test { +class TestListLikeScalar : public ::testing::Test { public: using ScalarType = typename TypeTraits::ScalarType; @@ -1177,17 +1189,18 @@ class TestListScalar : public ::testing::Test { std::shared_ptr value_; }; -using ListScalarTestTypes = ::testing::Types; +using ListScalarTestTypes = ::testing::Types; -TYPED_TEST_SUITE(TestListScalar, ListScalarTestTypes); +TYPED_TEST_SUITE(TestListLikeScalar, ListScalarTestTypes); -TYPED_TEST(TestListScalar, Basics) { this->TestBasics(); } +TYPED_TEST(TestListLikeScalar, Basics) { this->TestBasics(); } -TYPED_TEST(TestListScalar, ValidateErrors) { this->TestValidateErrors(); } +TYPED_TEST(TestListLikeScalar, ValidateErrors) { this->TestValidateErrors(); } -TYPED_TEST(TestListScalar, Hashing) { this->TestHashing(); } +TYPED_TEST(TestListLikeScalar, Hashing) { this->TestHashing(); } -TYPED_TEST(TestListScalar, Cast) { this->TestCast(); } +TYPED_TEST(TestListLikeScalar, Cast) { this->TestCast(); } TEST(TestFixedSizeListScalar, ValidateErrors) { const auto ty = fixed_size_list(int16(), 3); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 1386075397e20..c317fe7aef44c 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -499,6 +499,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, } namespace { + template std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, typename OffsetArrayType::value_type first_offset, @@ -608,6 +609,205 @@ std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, std::make_shared(), size, buffers, null_count); return std::make_shared(array_data); } + +// Helper for RandomArrayGenerator::ArrayOf: extract some C value from +// a given metadata key. +template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ", + ArrowType::type_name())); + } + return output; +} + +/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm [1]. 
+/// +/// [1] https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] data The array to shuffle +template +void ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) { + DCHECK_EQ(data->type->id(), ListViewType::type_id); + using offset_type = typename ListViewType::offset_type; + + auto* validity = data->GetMutableValues(0, 0); + auto* offsets = data->GetMutableValues(1); + auto* sizes = data->GetMutableValues(2); + + pcg32_fast rng(seed); + using UniformDist = std::uniform_int_distribution; + UniformDist dist; + for (int64_t i = data->length - 1; i > 0; --i) { + const auto j = dist(rng, UniformDist::param_type(0, i)); + if (ARROW_PREDICT_TRUE(i != j)) { + // Swap validity bits + if (validity) { + const bool valid_i = bit_util::GetBit(validity, data->offset + i); + const bool valid_j = bit_util::GetBit(validity, data->offset + i); + if (valid_i != valid_j) { + bit_util::SetBitTo(validity, data->offset + i, valid_j); + bit_util::SetBitTo(validity, data->offset + j, valid_i); + } + } + // Swap offsets and sizes + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); + } + } +} + +/// \brief Generate the list-view offsets based on a random buffer of sizes. +/// +/// The sizes buffer is an input of this function, but when force_empty_nulls is true, +/// some values on the sizes buffer can be set to 0. +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] mutable_sizes_array The array of sizes to use +/// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0 +/// \param[in] zero_undefined_offsets Whether to zero the offsets of list-views that have +/// 0 set as the size +/// \param[out] out_max_view_end The maximum value of the end of a list-view +template +std::shared_ptr ViewOffsetsFromLengthsArray( + SeedType seed, OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t* out_max_view_end, int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename OffsetArrayType::TypeClass; + + auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); + + BufferVector buffers{2}; + buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to + buffers[1] = *AllocateBuffer(sizeof(offset_type) * mutable_sizes_array.length(), + alignment, memory_pool); + auto offsets = buffers[1]->mutable_data_as(); + + offset_type offset = 0; + offset_type max_view_end = 0; + for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { + if (mutable_sizes_array.IsNull(i)) { + if (force_empty_nulls) { + sizes[i] = 0; + } + offsets[i] = zero_undefined_offsets ? 0 : offset; + } else { + if (sizes[i] == 0) { + offsets[i] = zero_undefined_offsets ? 
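// ShuffleListViewDataInPlace applies a Fisher-Yates pass over the (offset, size,
// validity) triples so that generated list-view entries are not laid out in
// monotonically increasing order, unlike plain list offsets. A standalone sketch of
// the same idea over parallel vectors; std::mt19937 stands in for the generator, and
// the important point is that each entry's offset and size move together, indexed by
// the same i and j:
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

void FisherYatesShuffleViews(uint32_t seed, std::vector<int32_t>* offsets,
                             std::vector<int32_t>* sizes) {
  // offsets and sizes are parallel arrays describing one view per index.
  if (offsets->size() < 2 || offsets->size() != sizes->size()) return;
  std::mt19937 rng(seed);
  for (size_t i = offsets->size() - 1; i > 0; --i) {
    std::uniform_int_distribution<size_t> dist(0, i);
    const size_t j = dist(rng);
    if (i != j) {
      std::swap((*offsets)[i], (*offsets)[j]);
      std::swap((*sizes)[i], (*sizes)[j]);
    }
  }
}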
0 : offset; + } else { + offsets[i] = offset; + DCHECK_LT(offset, std::numeric_limits::max() - sizes[i]); + offset += sizes[i]; + } + } + max_view_end = std::max(max_view_end, offsets[i] + sizes[i]); + } + *out_max_view_end = max_view_end; + + auto array_data = + ArrayData::Make(TypeTraits::type_singleton(), + mutable_sizes_array.length(), std::move(buffers), /*null_count=*/0); + return std::make_shared(std::move(array_data)); +} + +template +Result> ArrayOfListView(RAG& self, const Field& field, + int64_t length, int64_t alignment, + MemoryPool* memory_pool, + double null_probability) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename ArrayType::offset_type; + using OffsetArrayType = typename CTypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto zero_undefined_offsets = + GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); + const auto lengths = internal::checked_pointer_cast( + self.RAG::template Numeric( + length, min_length, max_length, null_probability)); + + int64_t max_view_end = 0; + const auto offsets = ViewOffsetsFromLengthsArray( + self.seed(), *lengths, force_empty_nulls, zero_undefined_offsets, &max_view_end, + alignment, memory_pool); + + const auto values = self.RAG::ArrayOf( + *internal::checked_pointer_cast(field.type())->value_field(), + /*values_length=*/max_view_end, alignment, memory_pool); + + ARROW_ASSIGN_OR_RAISE(auto list_view_array, + ArrayType::FromArrays(field.type(), *offsets, *lengths, *values)); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); + return list_view_array; +} + +template +Result> RandomListView(RAG& self, const Array& values, + int64_t length, double null_probability, + bool force_empty_nulls, double coverage, + int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename TypeClass::offset_type; + using OffsetArrayType = typename TypeTraits::OffsetArrayType; + using OffsetArrowType = typename OffsetArrayType::TypeClass; + + DCHECK_LE(values.length(), std::numeric_limits::max()); + DCHECK_LE(length, std::numeric_limits::max()); + + auto offsets_array = GenerateOffsets>( + self.seed(), length + 1, 0, static_cast(values.length()), null_probability, + force_empty_nulls, alignment, memory_pool); + auto* offsets = offsets_array->data()->template GetValues(1); + + // The buffers for the sizes array + BufferVector buffers{2}; + buffers[0] = NULLPTR; + buffers[1] = *AllocateBuffer(sizeof(offset_type) * length, alignment, memory_pool); + auto sizes = buffers[1]->mutable_data_as(); + + // Derive sizes from offsets taking coverage into account + pcg32_fast rng(self.seed()); + using NormalDist = std::normal_distribution; + NormalDist size_dist; + for (int64_t i = 0; i < length; ++i) { + const double mean_size = coverage * (offsets[i + 1] - offsets[i]); + const double sampled_size = + std::max(0.0, size_dist(rng, NormalDist::param_type{mean_size})); + // This creates a higher probability of offset[i] + size[i] being closer or equal to + // values.length(), but that skew is acceptable for the purposes of testing. 
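// RandomListView derives each view's size from the gap to the next offset, scaled by
// `coverage` and perturbed by a normal distribution, so only part of the child array
// is referenced when coverage < 1.0. A usage sketch of the generator methods added
// here; the values are chosen for illustration:
#include <memory>
#include "arrow/api.h"
#include "arrow/testing/random.h"

std::shared_ptr<arrow::Array> MakeSparseListViews() {
  arrow::random::RandomArrayGenerator rng(/*seed=*/42);
  auto values = rng.Int16(/*size=*/1000, /*min=*/0, /*max=*/100);
  // Roughly half of the child values end up covered by some view.
  return rng.ListView(*values, /*size=*/64, /*null_probability=*/0.1,
                      /*force_empty_nulls=*/false, /*coverage=*/0.5);
}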
+ const auto size = std::min(static_cast(std::llround(sampled_size)), + static_cast(values.length() - offsets[i])); + sizes[i] = offsets_array->IsNull(i) && force_empty_nulls ? 0 : size; + } + + auto sizes_array_data = ArrayData::Make(TypeTraits::type_singleton(), + length, std::move(buffers), /*null_count=*/0); + auto sizes_array = std::make_shared(std::move(sizes_array_data)); + + ARROW_ASSIGN_OR_RAISE( + auto list_view_array, + ArrayType::FromArrays(*offsets_array, *sizes_array, values, memory_pool)); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); + return list_view_array; +} + } // namespace std::shared_ptr RandomArrayGenerator::Offsets( @@ -637,6 +837,24 @@ std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t s return *::arrow::ListArray::FromArrays(*offsets, values); } +std::shared_ptr RandomArrayGenerator::ListView(const Array& values, int64_t length, + double null_probability, + bool force_empty_nulls, + double coverage, int64_t alignment, + MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} + +std::shared_ptr RandomArrayGenerator::LargeListView( + const Array& values, int64_t length, double null_probability, bool force_empty_nulls, + double coverage, int64_t alignment, MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} + std::shared_ptr RandomArrayGenerator::Map(const std::shared_ptr& keys, const std::shared_ptr& items, int64_t size, double null_probability, @@ -713,27 +931,6 @@ std::shared_ptr RandomArrayGenerator::DenseUnion(const ArrayVector& field return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes); } -namespace { - -// Helper for RandomArrayGenerator::ArrayOf: extract some C value from -// a given metadata key. -template ::ArrowType> -enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, - const std::string& key, - T default_value) { - if (!metadata) return default_value; - const auto index = metadata->FindKey(key); - if (index < 0) return default_value; - const auto& value = metadata->value(index); - T output{}; - if (!internal::ParseValue(value.data(), value.length(), &output)) { - ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); - } - return output; -} - -} // namespace - std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, int64_t size, double null_probability, @@ -811,6 +1008,12 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values); \ } +#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + return *ArrayOfListView(*this, field, length, alignment, memory_pool, \ + null_probability); \ + } + const double null_probability = field.nullable() ? 
GetMetadata(field.metadata().get(), "null_probability", 0.01) @@ -946,6 +1149,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(ListArray); + GENERATE_LIST_VIEW_CASE(ListViewArray); case Type::type::STRUCT: { ArrayVector child_arrays(field.type()->num_fields()); @@ -1069,6 +1273,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(LargeListArray); + GENERATE_LIST_VIEW_CASE(LargeListViewArray); default: break; @@ -1077,6 +1282,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t #undef GENERATE_INTEGRAL_CASE #undef GENERATE_FLOATING_CASE #undef GENERATE_LIST_CASE +#undef GENERATE_LIST_VIEW_CASE #undef VALIDATE_RANGE #undef VALIDATE_MIN_MAX diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index cbdac3baa0109..1d97a3ada724a 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -458,6 +458,43 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random ListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// must be set to 0 + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr ListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + + /// \brief Generate a random LargeListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// must be set to 0 + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr LargeListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, + double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random MapArray /// /// \param[in] keys The underlying keys array diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 951b654e56f73..a92ecf4e9c45b 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -70,7 +70,7 @@ class RandomArrayTest : public ::testing::TestWithParam { } bool HasList(const DataType& type) { - if (is_var_length_list(type.id())) { + if (is_var_length_list_like(type.id())) { return true; } for (const auto& child : type.fields()) { @@ -99,7 +99,7 @@ TEST_P(RandomArrayTest, GenerateArrayAlignment) { const int64_t alignment = 1024; auto field = GetField(); if (HasList(*field->type())) { - 
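// ArrayOf reads list-view generation knobs from field metadata ("min_length",
// "max_length", "force_empty_nulls", "zero_undefined_offsets"), as exercised by the
// test fields below. A sketch of driving it through those keys; the values are
// illustrative:
#include <memory>
#include "arrow/api.h"
#include "arrow/testing/random.h"

std::shared_ptr<arrow::Array> MakeFixedLengthListViews() {
  auto field = arrow::field(
      "lv", arrow::list_view(arrow::int8()), /*nullable=*/true,
      arrow::key_value_metadata({{"min_length", "3"},
                                 {"max_length", "3"},
                                 {"force_empty_nulls", "true"}}));
  arrow::random::RandomArrayGenerator rng(/*seed=*/42);
  // Every non-null entry becomes a view of exactly 3 elements; null entries are
  // forced to length 0.
  return rng.ArrayOf(*field, /*size=*/128);
}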
GTEST_SKIP() << "ListArray::FromArrays does not conserve buffer alignment"; + GTEST_SKIP() << "List[View]Array::FromArrays does not conserve buffer alignment"; } auto array = GenerateArray(*field, /*size=*/13, 0xDEADBEEF, alignment); AssertTypeEqual(field->type(), array->type()); @@ -177,6 +177,13 @@ auto values = ::testing::Values( key_value_metadata({{"force_empty_nulls", "true"}})), field("listint81024values", list(int8()), true, key_value_metadata({{"values", "1024"}})), + field("listviewint8", list_view(int8())), + field("listviewlistviewint8", list_view(list_view(int8()))), + field("listviewint8emptynulls", list_view(int8()), true, + key_value_metadata( + {{"force_empty_nulls", "true"}, {"zero_undefined_offsets", "true"}})), + field("listviewint81024values", list_view(int8()), true, + key_value_metadata({{"values", "1024"}})), field("structints", struct_({ field("int8", int8()), field("int16", int16()), @@ -201,7 +208,8 @@ auto values = ::testing::Values( field("fixedsizelist", fixed_size_list(int8(), 4)), field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), field("largebinary", large_binary()), - field("largelistlistint8", large_list(list(int8())))); + field("largelistlistint8", large_list(list(int8()))), + field("largelistviewlistviewint8", large_list_view(list_view(int8())))); INSTANTIATE_TEST_SUITE_P( TestRandomArrayGeneration, RandomArrayTest, values, @@ -400,6 +408,39 @@ TEST(TypeSpecificTests, ListLengths) { } } +TEST(TypeSpecificTests, ListViewLengths) { + { + auto field = + arrow::field("list_view", list_view(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), kExpectedLength); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = + arrow::field("list_view", large_list_view(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_EQ(array->length(), kExpectedLength); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } +} + TEST(TypeSpecificTests, MapValues) { auto field = arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); @@ -500,6 +541,24 @@ TEST(RandomList, Basics) { } } +TEST(RandomListView, Basics) { + random::RandomArrayGenerator rng(42); + for (const double null_probability : {0.0, 0.1, 0.98}) { + SCOPED_TRACE("null_probability = " + std::to_string(null_probability)); + auto values = rng.Int16(1234, 0, 10000, null_probability); + auto array = rng.ListView(*values, 45, null_probability); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), 45); + const auto& list_view_array = checked_cast(*array); + ASSERT_EQ(list_view_array.values()->length(), 1234); + int64_t null_count = 0; + for (int64_t i = 0; i < array->length(); ++i) { + null_count += array->IsNull(i); + } + ASSERT_EQ(null_count, array->data()->null_count); + } +} + TEST(RandomChildFieldNullablity, List) { random::RandomArrayGenerator rng(42); @@ -513,6 +572,19 @@ 
TEST(RandomChildFieldNullablity, List) { ARROW_EXPECT_OK(batch->ValidateFull()); } +TEST(RandomChildFieldNullablity, ListView) { + random::RandomArrayGenerator rng(42); + + auto item = arrow::field("item", arrow::int8(), true); + auto nest_list_view_field = arrow::field("list_view", list_view(item), false); + auto list_view_field = arrow::field("list_view", list_view(nest_list_view_field), true); + auto array = rng.ArrayOf(*list_view_field, 428); + ARROW_EXPECT_OK(array->ValidateFull()); + + auto batch = rng.BatchOf({list_view_field}, 428); + ARROW_EXPECT_OK(batch->ValidateFull()); +} + TEST(RandomChildFieldNullablity, Struct) { random::RandomArrayGenerator rng(42); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f378bd974047d..62d2d61598dc8 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -140,6 +140,8 @@ std::vector AllTypeIds() { Type::STRUCT, Type::LIST, Type::LARGE_LIST, + Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::MAP, Type::DENSE_UNION, @@ -209,6 +211,8 @@ std::string ToString(Type::type id) { TO_STRING_CASE(STRUCT) TO_STRING_CASE(LIST) TO_STRING_CASE(LARGE_LIST) + TO_STRING_CASE(LIST_VIEW) + TO_STRING_CASE(LARGE_LIST_VIEW) TO_STRING_CASE(FIXED_SIZE_LIST) TO_STRING_CASE(MAP) TO_STRING_CASE(DENSE_UNION) @@ -992,6 +996,18 @@ std::string LargeListType::ToString() const { return s.str(); } +std::string ListViewType::ToString() const { + std::stringstream s; + s << "list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + +std::string LargeListViewType::ToString() const { + std::stringstream s; + s << "large_list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + MapType::MapType(std::shared_ptr key_type, std::shared_ptr item_type, bool keys_sorted) : MapType(::arrow::field("key", std::move(key_type), false), @@ -2888,6 +2904,38 @@ std::string LargeListType::ComputeFingerprint() const { return ""; } +std::string ListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + +std::string LargeListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + std::string MapType::ComputeFingerprint() const { const auto& key_fingerprint = key_type()->fingerprint(); const auto& item_fingerprint = item_type()->fingerprint(); @@ -3138,6 +3186,22 @@ std::shared_ptr fixed_size_list(const std::shared_ptr& value_fi return std::make_shared(value_field, list_size); } +std::shared_ptr list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + std::shared_ptr struct_(const FieldVector& fields) { return std::make_shared(fields); } diff --git 
a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a905192e4a54e..5b1331ab66919 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1174,6 +1174,71 @@ class ARROW_EXPORT LargeListType : public BaseListType { std::string ComputeFingerprint() const override; }; +/// \brief Type class for array of list views +class ARROW_EXPORT ListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LIST_VIEW; + using offset_type = int32_t; + + static constexpr const char* type_name() { return "list_view"; } + + // ListView can contain any other logical value type + explicit ListViewType(const std::shared_ptr& value_type) + : ListViewType(std::make_shared("item", value_type)) {} + + explicit ListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + +/// \brief Concrete type class for large list-view data +/// +/// LargeListViewType is like ListViewType but with 64-bit rather than 32-bit offsets and +/// sizes. +class ARROW_EXPORT LargeListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LARGE_LIST_VIEW; + using offset_type = int64_t; + + static constexpr const char* type_name() { return "large_list_view"; } + + // LargeListView can contain any other logical value type + explicit LargeListViewType(const std::shared_ptr& value_type) + : LargeListViewType(std::make_shared("item", value_type)) {} + + explicit LargeListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "large_list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for map data /// /// Map data is nested data where each value is a variable number of diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ca263b710317b..63eec10bf723b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -150,6 +150,16 @@ class LargeListArray; class LargeListBuilder; struct LargeListScalar; +class ListViewType; +class ListViewArray; +class ListViewBuilder; +struct ListViewScalar; + +class LargeListViewType; +class LargeListViewArray; +class LargeListViewBuilder; +struct LargeListViewScalar; + class MapType; class MapArray; class MapBuilder; @@ -432,6 +442,12 @@ struct Type { /// Bytes view type with 4-byte prefix and inline small string optimization BINARY_VIEW = 40, + /// A list of some logical data type represented by offset and size. 
+ LIST_VIEW = 41, + + /// Like LIST_VIEW, but with 64-bit offsets and sizes + LARGE_LIST_VIEW = 42, + + // Leave this at the end MAX_ID }; @@ -523,6 +539,19 @@ std::shared_ptr large_list(const std::shared_ptr& value_type); ARROW_EXPORT std::shared_ptr large_list(const std::shared_ptr& value_type); +/// \brief Create a ListViewType instance +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a ListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance +ARROW_EXPORT std::shared_ptr large_list_view( + std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr large_list_view(std::shared_ptr value_type); + /// \brief Create a MapType instance from its key and value DataTypes ARROW_EXPORT std::shared_ptr map(std::shared_ptr key_type, diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 273f8933fa577..009e557f82f68 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1553,6 +1553,46 @@ TEST(TestLargeListType, Basics) { ASSERT_EQ("large_list>", lt2.ToString()); } +TEST(TestListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + ListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), Type::LIST_VIEW); + + ASSERT_EQ("list_view", list_view_type.name()); + ASSERT_EQ("list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("list_view", lt->ToString()); + + ListViewType lt2(lt); + ASSERT_EQ("list_view>", lt2.ToString()); +} + +TEST(TestLargeListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + LargeListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), Type::LARGE_LIST_VIEW); + + ASSERT_EQ("large_list_view", list_view_type.name()); + ASSERT_EQ("large_list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("large_list_view", lt->ToString()); + + LargeListViewType lt2(lt); + ASSERT_EQ("large_list_view>", lt2.ToString()); +} + TEST(TestMapType, Basics) { auto md = key_value_metadata({"foo"}, {"foo value"}); @@ -1829,6 +1869,32 @@ TEST(TestListType, Equals) { ASSERT_FALSE(list_type.Equals(list_type_named, /*check_metadata=*/true)); } +TEST(TestListViewType, Equals) { + auto t1 = list_view(utf8()); + auto t2 = list_view(utf8()); + auto t3 = list_view(binary()); + auto t4 = list_view(field("item", utf8(), /*nullable=*/false)); + auto tl1 = large_list_view(binary()); + auto tl2 = large_list_view(binary()); + auto tl3 = large_list_view(float64()); + + AssertTypeEqual(*t1, *t2); + AssertTypeNotEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t4); + AssertTypeNotEqual(*t3, *tl1); + AssertTypeEqual(*tl1, *tl2); + AssertTypeNotEqual(*tl2, *tl3); + + std::shared_ptr vt = std::make_shared(); + std::shared_ptr inner_field = std::make_shared("non_default_name", vt); + + ListViewType list_view_type(vt); + ListViewType list_view_type_named(inner_field); + + AssertTypeEqual(list_view_type, list_view_type_named); + ASSERT_FALSE(list_view_type.Equals(list_view_type_named, /*check_metadata=*/true)); +} +
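To make the new factories and the offsets-plus-sizes layout concrete, here is a small sketch. It assumes the list_view() factory added above, the ListViewArray class forward-declared in type_fwd.h (defined in array_nested.h), and the FromArrays(offsets, sizes, values) call already used in random.cc earlier in this diff; ArrayFromJSON is a test-only convenience.

#include "arrow/array/array_nested.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only helper)
#include "arrow/type.h"

// Sketch: a list_view<int16> array logically equal to [[0, 1], [2], [2, 3, 4]].
// Unlike list offsets, list-view offsets need not be monotonic and views may
// overlap (here the second and third views share the value at index 2).
arrow::Result<std::shared_ptr<arrow::ListViewArray>> MakeExampleListView() {
  auto values = arrow::ArrayFromJSON(arrow::int16(), "[0, 1, 2, 3, 4]");
  auto offsets = arrow::ArrayFromJSON(arrow::int32(), "[0, 2, 2]");
  auto sizes = arrow::ArrayFromJSON(arrow::int32(), "[2, 1, 3]");
  return arrow::ListViewArray::FromArrays(*offsets, *sizes, *values);
}

As the Equals tests above show, the child field name is ignored by default and only becomes significant when check_metadata is requested.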
TEST(TestListType, Metadata) { auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); @@ -1859,6 +1925,66 @@ TEST(TestListType, Metadata) { AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); } +TEST(TestListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = list_view(f1); + auto t2 = list_view(f2); + auto t3 = list_view(f3); + auto t4 = list_view(f4); + auto t5 = list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + +TEST(TestLargeListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = large_list_view(f1); + auto t2 = large_list_view(f2); + auto t3 = large_list_view(f3); + auto t4 = large_list_view(f4); + auto t5 = large_list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + TEST(TestNestedType, Equals) { auto create_struct = [](std::string inner_name, std::string struct_name) -> std::shared_ptr { @@ -2258,6 +2384,44 @@ TEST(TypesTest, TestRunEndEncodedType) { "run_end_encoded>"); } +TEST(TypesTest, TestListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "list_view"); +} + +TEST(TypesTest, TestLargeListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = large_list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + 
ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = large_list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "large_list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "large_list_view"); +} + #define TEST_PREDICATE(all_types, type_predicate) \ for (auto type : all_types) { \ ASSERT_EQ(type_predicate(type->id()), type_predicate(*type)); \ @@ -2296,6 +2460,7 @@ TEST(TypesTest, TestMembership) { TEST_PREDICATE(all_types, is_fixed_width); TEST_PREDICATE(all_types, is_var_length_list); TEST_PREDICATE(all_types, is_list_like); + TEST_PREDICATE(all_types, is_var_length_list_like); TEST_PREDICATE(all_types, is_nested); TEST_PREDICATE(all_types, is_union); } diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index de328f322ad5f..ded54aff463c1 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -67,21 +67,23 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::BINARY: // Offsets may be cast to int32_t* case Type::DATE32: case Type::TIME32: - case Type::LIST: // Offsets may be cast to int32_t*, data is in child array - case Type::MAP: // This is a list array + case Type::LIST: // Offsets may be cast to int32_t* + case Type::LIST_VIEW: // Offsets and sizes may be cast to int32_t* + case Type::MAP: // Same as LIST case Type::INTERVAL_MONTHS: // Stored as int32_t* case Type::INTERVAL_DAY_TIME: // Stored as two contiguous 32-bit integers return 4; case Type::INT64: case Type::UINT64: case Type::DOUBLE: - case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::LARGE_BINARY: // Offsets may be cast to int64_t* - case Type::LARGE_LIST: // Offsets may be cast to int64_t* - case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::LARGE_BINARY: // Offsets may be cast to int64_t* + case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::LARGE_LIST: // Offsets may be cast to int64_t* + case Type::LARGE_LIST_VIEW: // Offsets and sizes may be cast to int64_t* case Type::DATE64: case Type::TIME64: case Type::TIMESTAMP: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 9d8cafacf397b..ed66c9367dc36 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -449,6 +449,7 @@ struct TypeTraits { using OffsetBuilderType = Int32Builder; using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; + using LargeType = LargeListType; }; template <> @@ -463,6 +464,31 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; +template <> +struct TypeTraits { + using ArrayType = ListViewArray; + using BuilderType = ListViewBuilder; + using ScalarType = ListViewScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = false; + using LargeType = LargeListViewType; +}; + +template <> +struct TypeTraits { + using ArrayType = 
LargeListViewArray; + using BuilderType = LargeListViewBuilder; + using ScalarType = LargeListViewScalar; + using OffsetType = Int64Type; + using OffsetArrayType = Int64Array; + using OffsetBuilderType = Int64Builder; + using OffsetScalarType = Int64Scalar; + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = MapArray; @@ -750,6 +776,13 @@ using is_list_type = template using enable_if_list_type = enable_if_t::value, R>; +template +using is_list_view_type = + std::disjunction, std::is_same>; + +template +using enable_if_list_view = enable_if_t::value, R>; + template using is_list_like_type = std::integral_constant::value || @@ -758,6 +791,14 @@ using enable_if_list_like = enable_if_t::value, R>; +template +using is_var_length_list_like_type = + std::disjunction, is_list_view_type>; + +template +using enable_if_var_length_list_like = + enable_if_t::value, R>; + template using is_struct_type = std::is_base_of; @@ -1303,6 +1344,39 @@ constexpr bool is_list_like(Type::type type_id) { return false; } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a var-length list or list-view like type +constexpr bool is_var_length_list_like(Type::type type_id) { + switch (type_id) { + case Type::LIST: + case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + case Type::MAP: + return true; + default: + break; + } + return false; +} + +/// \brief Check for a list-view type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a list-view type +constexpr bool is_list_view(Type::type type_id) { + switch (type_id) { + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + return true; + default: + break; + } + return false; +} + /// \brief Check for a nested type /// /// \param[in] type_id the type-id to check @@ -1311,6 +1385,8 @@ constexpr bool is_nested(Type::type type_id) { switch (type_id) { case Type::LIST: case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: case Type::FIXED_SIZE_LIST: case Type::MAP: case Type::STRUCT: @@ -1403,12 +1479,14 @@ static inline int offset_bit_width(Type::type type_id) { case Type::STRING: case Type::BINARY: case Type::LIST: + case Type::LIST_VIEW: case Type::MAP: case Type::DENSE_UNION: return 32; case Type::LARGE_STRING: case Type::LARGE_BINARY: case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: return 64; default: break; @@ -1609,6 +1687,24 @@ static inline bool is_var_length_list(const DataType& type) { /// Convenience for checking using the type's id static inline bool is_list_like(const DataType& type) { return is_list_like(type.id()); } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type the type to check +/// \return whether type is a var-length list or list-view like type +/// +/// Convenience for checking using the type's id +static inline bool is_var_length_list_like(const DataType& type) { + return is_var_length_list_like(type.id()); +} + +/// \brief Check for a list-view type +/// +/// \param[in] type the type to check +/// \return whether type is a list-view type +/// +/// Convenience for checking using the type's id +static inline bool is_list_view(const DataType& type) { return is_list_view(type.id()); } + /// \brief Check for a nested type /// /// \param[in] type the type to check diff --git a/cpp/src/arrow/util/CMakeLists.txt
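A brief sketch of how generic code is expected to use the predicates and SFINAE helpers added above; the function names here are hypothetical, only the traits and predicates come from the type_traits.h hunk.

#include <type_traits>

#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"

// Compile-time mapping added above: list-like traits now expose LargeType.
static_assert(std::is_same_v<arrow::TypeTraits<arrow::ListViewType>::LargeType,
                             arrow::LargeListViewType>);

// Runtime dispatch: LIST, LARGE_LIST, MAP and the two view types are all
// "var-length list-like", but only the view types carry a sizes buffer.
bool NeedsSizesBuffer(const arrow::DataType& type) {
  return arrow::is_var_length_list_like(type) && arrow::is_list_view(type);
}

// Compile-time dispatch: this hypothetical overload only participates in
// overload resolution for ListViewType and LargeListViewType.
template <typename T>
arrow::enable_if_list_view<T, arrow::Status> VisitListLike(const T&) {
  return arrow::Status::OK();  // list-view-specific handling would go here
}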
index 2e9487dcf50c8..badf8a75078ed 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -55,6 +55,7 @@ add_arrow_test(utility-test int_util_test.cc ${IO_UTIL_TEST_SOURCES} iterator_test.cc + list_util_test.cc logging_test.cc queue_test.cc range_test.cc diff --git a/cpp/src/arrow/util/align_util.h b/cpp/src/arrow/util/align_util.h index 63df63749cf5c..71920e49f4aa2 100644 --- a/cpp/src/arrow/util/align_util.h +++ b/cpp/src/arrow/util/align_util.h @@ -74,7 +74,7 @@ namespace util { /// \brief Special alignment value to use data type-specific alignment /// /// If this is passed as the `alignment` in one of the CheckAlignment or EnsureAlignment -/// functions, then the function will ensure ensure each buffer is suitably aligned +/// functions, then the function will ensure each buffer is suitably aligned /// for the data type of the array. For example, given an int32 buffer the values /// buffer's address must be a multiple of 4. Given a large_string buffer the offsets /// buffer's address must be a multiple of 8. diff --git a/cpp/src/arrow/util/async_generator.h b/cpp/src/arrow/util/async_generator.h index a06be707f2fb0..f9bcd534567c6 100644 --- a/cpp/src/arrow/util/async_generator.h +++ b/cpp/src/arrow/util/async_generator.h @@ -715,7 +715,7 @@ AsyncGenerator MakeSerialReadaheadGenerator(AsyncGenerator source_generato /// generator() once before it returns. The returned generator will otherwise /// mirror the source. /// -/// This generator forwards aysnc-reentrant pressure to the source +/// This generator forwards async-reentrant pressure to the source /// This generator buffers one item (the first result) until it is delivered. template AsyncGenerator MakeAutoStartingGenerator(AsyncGenerator generator) { @@ -1843,7 +1843,7 @@ constexpr int kDefaultBackgroundQRestart = 16; /// active background thread task at any given time. You MUST transfer away from this /// background generator. Otherwise there could be a race condition if a callback on the /// background thread deletes the last consumer reference to the background generator. You -/// can transfer onto the same executor as the background thread, it is only neccesary to +/// can transfer onto the same executor as the background thread, it is only necessary to /// create a new thread task, not to switch executors. /// /// This generator is not async-reentrant diff --git a/cpp/src/arrow/util/async_generator_test.cc b/cpp/src/arrow/util/async_generator_test.cc index 7fb99f167c605..2b74313db279b 100644 --- a/cpp/src/arrow/util/async_generator_test.cc +++ b/cpp/src/arrow/util/async_generator_test.cc @@ -719,7 +719,7 @@ TEST_P(MergedGeneratorTestFixture, MergedStress) { sources.push_back(source); } AsyncGenerator> source_gen = util::AsyncVectorIt(sources); - auto outer_gaurd = ExpectNotAccessedReentrantly(&source_gen); + auto outer_guard = ExpectNotAccessedReentrantly(&source_gen); auto merged = MakeMergedGenerator(source_gen, 4); ASSERT_FINISHES_OK_AND_ASSIGN(auto items, CollectAsyncGenerator(merged)); @@ -1095,7 +1095,7 @@ TEST_P(BackgroundGeneratorTestFixture, BadResult) { ASSERT_FINISHES_OK_AND_EQ(TestInt(1), generator()); // Next three results may or may not be valid. // The typical case is the call for TestInt(2) restarts a full queue and then maybe - // TestInt(3) and TestInt(4) arrive quickly enough to not get pre-empted or maybe + // TestInt(3) and TestInt(4) arrive quickly enough to not get preempted or maybe // they don't. 
// // A more bizarre, but possible, case is the checking thread falls behind the producer diff --git a/cpp/src/arrow/util/benchmark_util.h b/cpp/src/arrow/util/benchmark_util.h index 2a3dcf56f88bf..75639ac11ae41 100644 --- a/cpp/src/arrow/util/benchmark_util.h +++ b/cpp/src/arrow/util/benchmark_util.h @@ -161,7 +161,7 @@ class MemoryPoolMemoryManager : public benchmark::MemoryManager { int64_t new_default_allocations = default_pool->num_allocations() - global_allocations_start; - // Only record metrics metrics if (1) there were allocations and (2) we + // Only record metrics if (1) there were allocations and (2) we // recorded at least one. if (new_default_allocations > 0 && memory_pool->num_allocations() > 0) { if (new_default_allocations > memory_pool->num_allocations()) { diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h index f77cc3193624c..73a1ee8600fb4 100644 --- a/cpp/src/arrow/util/bit_block_counter.h +++ b/cpp/src/arrow/util/bit_block_counter.h @@ -200,7 +200,7 @@ class ARROW_EXPORT BitBlockCounter { int64_t offset_; }; -/// \brief A tool to iterate through a possibly non-existent validity bitmap, +/// \brief A tool to iterate through a possibly nonexistent validity bitmap, /// to allow us to write one code path for both the with-nulls and no-nulls /// cases without giving up a lot of performance. class ARROW_EXPORT OptionalBitBlockCounter { diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 15eadc9f2e7ea..e026dfec24065 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -924,7 +924,7 @@ TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) writer.Finish(); EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); }; - // 0ffset zero would not be a valid mask. + // Offset zero would not be a valid mask. check_with_set("11111111", 1); check_with_set("10111111", 2); check_with_set("10011111", 3); diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h index 89006ba887b29..5526c87dbcaf2 100644 --- a/cpp/src/arrow/util/bitmap_reader.h +++ b/cpp/src/arrow/util/bitmap_reader.h @@ -256,7 +256,7 @@ class BitmapWordReader { } }; -/// \brief Index into a possibly non-existent bitmap +/// \brief Index into a possibly nonexistent bitmap struct OptionalBitIndexer { const uint8_t* bitmap; const int64_t offset; diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 5a3a7bad3d344..9105aaa2af411 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -16,7 +16,7 @@ // under the License. #include "arrow/util/bpacking_avx2.h" -#include "arrow/util/bpacking_simd256_generated.h" +#include "arrow/util/bpacking_simd256_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 08ccd3fcd4d86..3570bcc352b19 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -16,7 +16,7 @@ // under the License. 
#include "arrow/util/bpacking_avx512.h" -#include "arrow/util/bpacking_simd512_generated.h" +#include "arrow/util/bpacking_simd512_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index a0bb5dc7a9e7b..3ab6de75f4c63 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -16,7 +16,7 @@ // under the License. #include "arrow/util/bpacking_neon.h" -#include "arrow/util/bpacking_simd128_generated.h" +#include "arrow/util/bpacking_simd128_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_simd128_generated.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h similarity index 100% rename from cpp/src/arrow/util/bpacking_simd128_generated.h rename to cpp/src/arrow/util/bpacking_simd128_generated_internal.h diff --git a/cpp/src/arrow/util/bpacking_simd256_generated.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h similarity index 100% rename from cpp/src/arrow/util/bpacking_simd256_generated.h rename to cpp/src/arrow/util/bpacking_simd256_generated_internal.h diff --git a/cpp/src/arrow/util/bpacking_simd512_generated.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h similarity index 100% rename from cpp/src/arrow/util/bpacking_simd512_generated.h rename to cpp/src/arrow/util/bpacking_simd512_generated_internal.h diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py old mode 100644 new mode 100755 index 3a5e2d58daecd..581a19a53e58a --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -18,9 +18,9 @@ # under the License. # Usage: -# python bpacking_simd_codegen.py 128 > bpacking_simd128_generated.h -# python bpacking_simd_codegen.py 256 > bpacking_simd256_generated.h -# python bpacking_simd_codegen.py 512 > bpacking_simd512_generated.h +# python bpacking_simd_codegen.py 128 > bpacking_simd128_generated_internal.h +# python bpacking_simd_codegen.py 256 > bpacking_simd256_generated_internal.h +# python bpacking_simd_codegen.py 512 > bpacking_simd512_generated_internal.h from functools import partial import sys diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index ae85e2cfa81a3..4bc732ec24313 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -298,7 +298,7 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value } // Path for float. - // 1. Processed hierarchically to 32i blcok using the unpack intrinsics. + // 1. Processed hierarchically to 32i block using the unpack intrinsics. // 2. Pack 128i block using _mm256_permutevar8x32_epi32. // 3. Pack final 256i block with _mm256_permute2x128_si256. constexpr size_t kNumUnpack = 3U; @@ -534,7 +534,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101); } else { // Path for float. - // 1. Processed hierarchically to 32i blcok using the unpack intrinsics. + // 1. Processed hierarchically to 32i block using the unpack intrinsics. // 2. Pack 128i block using _mm256_permutevar8x32_epi32. // 3. Pack final 256i block with _mm256_permute2x128_si256. 
for (size_t i = 0; i < kNumStreams; ++i) diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 3ea27f57da881..c98f0a086738b 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -49,8 +49,8 @@ struct NamedFunc { }; // A simplistic reference implementation for validation -void RefererenceByteStreamSplitEncode(const uint8_t* src, int width, - const int64_t num_values, uint8_t* dest) { +void ReferenceByteStreamSplitEncode(const uint8_t* src, int width, + const int64_t num_values, uint8_t* dest) { for (int64_t i = 0; i < num_values; ++i) { for (int stream = 0; stream < width; ++stream) { dest[stream * num_values + i] = *src++; @@ -129,7 +129,7 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { protected: static void ReferenceEncode(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { - RefererenceByteStreamSplitEncode(raw_values, kWidth, num_values, output_buffer_raw); + ReferenceByteStreamSplitEncode(raw_values, kWidth, num_values, output_buffer_raw); } static std::vector MakeRandomInput(int64_t num_values) { diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 17e013c13ee0b..be957afab3c46 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -109,6 +109,7 @@ class LZ4Decompressor : public Decompressor { auto dst_capacity = static_cast(output_len); size_t ret; + DCHECK_NE(src, nullptr); ret = LZ4F_decompress(ctx_, dst, &dst_capacity, src, &src_size, nullptr /* options */); if (LZ4F_isError(ret)) { diff --git a/cpp/src/arrow/util/compression_test.cc b/cpp/src/arrow/util/compression_test.cc index 8f2a7f052ccb6..eeeedce17764f 100644 --- a/cpp/src/arrow/util/compression_test.cc +++ b/cpp/src/arrow/util/compression_test.cc @@ -368,6 +368,50 @@ TEST_P(CodecTest, CodecRoundtrip) { } } +TEST(CodecTest, CodecRoundtripGzipMembers) { +#ifndef ARROW_WITH_ZLIB + GTEST_SKIP() << "Test requires Zlib compression"; +#endif + std::unique_ptr gzip_codec; + ASSERT_OK_AND_ASSIGN(gzip_codec, Codec::Create(Compression::GZIP)); + + for (int data_size : {0, 10000, 100000}) { + int64_t compressed_size_p1, compressed_size_p2; + uint32_t p1_size = data_size / 4; + uint32_t p2_size = data_size - p1_size; + std::vector data_full = MakeRandomData(data_size); + std::vector data_p1(data_full.begin(), data_full.begin() + p1_size); + std::vector data_p2(data_full.begin() + p1_size, data_full.end()); + + int max_compressed_len_p1 = + static_cast(gzip_codec->MaxCompressedLen(p1_size, data_p1.data())); + int max_compressed_len_p2 = + static_cast(gzip_codec->MaxCompressedLen(p2_size, data_p2.data())); + std::vector compressed(max_compressed_len_p1 + max_compressed_len_p2); + + // Compress in 2 parts separately + ASSERT_OK_AND_ASSIGN(compressed_size_p1, + gzip_codec->Compress(p1_size, data_p1.data(), + max_compressed_len_p1, compressed.data())); + ASSERT_OK_AND_ASSIGN( + compressed_size_p2, + gzip_codec->Compress(p2_size, data_p2.data(), max_compressed_len_p2, + compressed.data() + compressed_size_p1)); + compressed.resize(compressed_size_p1 + compressed_size_p2); + + // Decompress the concatenated compressed gzip members + std::vector decompressed(data_size); + int64_t actual_decompressed_size; + ASSERT_OK_AND_ASSIGN( + actual_decompressed_size, + gzip_codec->Decompress(compressed.size(), compressed.data(), decompressed.size(), + decompressed.data())); + + ASSERT_EQ(data_size, 
actual_decompressed_size); + ASSERT_EQ(data_full, decompressed); + } +} + TEST(TestCodecMisc, SpecifyCompressionLevel) { struct CombinationOption { Compression::type codec; diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 2b38bdceab15b..a51f783be3ca7 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -381,6 +381,9 @@ class GZipCodec : public Codec { Result Decompress(int64_t input_length, const uint8_t* input, int64_t output_buffer_length, uint8_t* output) override { + int64_t read_input_bytes = 0; + int64_t decompressed_bytes = 0; + if (!decompressor_initialized_) { RETURN_NOT_OK(InitDecompressor()); } @@ -392,40 +395,46 @@ class GZipCodec : public Codec { return 0; } - // Reset the stream for this block - if (inflateReset(&stream_) != Z_OK) { - return ZlibErrorPrefix("zlib inflateReset failed: ", stream_.msg); - } + // inflate() will not automatically decode concatenated gzip members, keep calling + // inflate until reading all input data (GH-38271). + while (read_input_bytes < input_length) { + // Reset the stream for this block + if (inflateReset(&stream_) != Z_OK) { + return ZlibErrorPrefix("zlib inflateReset failed: ", stream_.msg); + } - int ret = 0; - // gzip can run in streaming mode or non-streaming mode. We only - // support the non-streaming use case where we present it the entire - // compressed input and a buffer big enough to contain the entire - // compressed output. In the case where we don't know the output, - // we just make a bigger buffer and try the non-streaming mode - // from the beginning again. - while (ret != Z_STREAM_END) { - stream_.next_in = const_cast(reinterpret_cast(input)); - stream_.avail_in = static_cast(input_length); - stream_.next_out = reinterpret_cast(output); - stream_.avail_out = static_cast(output_buffer_length); + int ret = 0; + // gzip can run in streaming mode or non-streaming mode. We only + // support the non-streaming use case where we present it the entire + // compressed input and a buffer big enough to contain the entire + // compressed output. In the case where we don't know the output, + // we just make a bigger buffer and try the non-streaming mode + // from the beginning again. + stream_.next_in = + const_cast(reinterpret_cast(input + read_input_bytes)); + stream_.avail_in = static_cast(input_length - read_input_bytes); + stream_.next_out = reinterpret_cast(output + decompressed_bytes); + stream_.avail_out = static_cast(output_buffer_length - decompressed_bytes); // We know the output size. In this case, we can use Z_FINISH // which is more efficient. ret = inflate(&stream_, Z_FINISH); - if (ret == Z_STREAM_END || ret != Z_OK) break; + if (ret == Z_OK) { + // Failure, buffer was too small + return Status::IOError("Too small a buffer passed to GZipCodec. InputLength=", + input_length, " OutputLength=", output_buffer_length); + } - // Failure, buffer was too small - return Status::IOError("Too small a buffer passed to GZipCodec. 
InputLength=", - input_length, " OutputLength=", output_buffer_length); - } + // Failure for some other reason + if (ret != Z_STREAM_END) { + return ZlibErrorPrefix("GZipCodec failed: ", stream_.msg); + } - // Failure for some other reason - if (ret != Z_STREAM_END) { - return ZlibErrorPrefix("GZipCodec failed: ", stream_.msg); + read_input_bytes += stream_.total_in; + decompressed_bytes += stream_.total_out; } - return stream_.total_out; + return decompressed_bytes; } int64_t MaxCompressedLen(int64_t input_length, diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 13709aa2f0cde..ce71def497161 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -312,7 +312,7 @@ struct Decimal128RealConversion return x; } - /// An appoximate conversion from Decimal128 to Real that guarantees: + /// An approximate conversion from Decimal128 to Real that guarantees: /// 1. If the decimal is an integer, the conversion is exact. /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact @@ -1006,7 +1006,7 @@ struct Decimal256RealConversion return x; } - /// An appoximate conversion from Decimal256 to Real that guarantees: + /// An approximate conversion from Decimal256 to Real that guarantees: /// 1. If the decimal is an integer, the conversion is exact. /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc new file mode 100644 index 0000000000000..feab2324a4029 --- /dev/null +++ b/cpp/src/arrow/util/dict_util.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/dict_util.h" +#include "arrow/array/array_dict.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace dict_util { + +namespace { + +template +int64_t LogicalNullCount(const ArraySpan& span) { + const auto* indices_null_bit_map = span.buffers[0].data; + const auto& dictionary_span = span.dictionary(); + const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data; + + using CType = typename IndexArrowType::c_type; + const CType* indices_data = span.GetValues(1); + int64_t null_count = 0; + for (int64_t i = 0; i < span.length; i++) { + if (indices_null_bit_map != nullptr && + !bit_util::GetBit(indices_null_bit_map, i + span.offset)) { + null_count++; + continue; + } + + CType current_index = indices_data[i]; + if (!bit_util::GetBit(dictionary_null_bit_map, + current_index + dictionary_span.offset)) { + null_count++; + } + } + return null_count; +} + +} // namespace + +int64_t LogicalNullCount(const ArraySpan& span) { + if (span.dictionary().GetNullCount() == 0 || span.length == 0) { + return span.GetNullCount(); + } + + const auto& dict_array_type = internal::checked_cast(*span.type); + switch (dict_array_type.index_type()->id()) { + case Type::UINT8: + return LogicalNullCount(span); + case Type::INT8: + return LogicalNullCount(span); + case Type::UINT16: + return LogicalNullCount(span); + case Type::INT16: + return LogicalNullCount(span); + case Type::UINT32: + return LogicalNullCount(span); + case Type::INT32: + return LogicalNullCount(span); + case Type::UINT64: + return LogicalNullCount(span); + default: + return LogicalNullCount(span); + } +} +} // namespace dict_util +} // namespace arrow diff --git a/cpp/src/arrow/util/dict_util.h b/cpp/src/arrow/util/dict_util.h new file mode 100644 index 0000000000000..a92733ae0f63d --- /dev/null +++ b/cpp/src/arrow/util/dict_util.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/array/data.h" + +namespace arrow { +namespace dict_util { + +int64_t LogicalNullCount(const ArraySpan& span); + +} // namespace dict_util +} // namespace arrow diff --git a/cpp/src/arrow/util/future.h b/cpp/src/arrow/util/future.h index 17c641c31c53d..283b581a5100a 100644 --- a/cpp/src/arrow/util/future.h +++ b/cpp/src/arrow/util/future.h @@ -435,7 +435,7 @@ class [[nodiscard]] Future { return MakeFinished(E::ToResult(std::move(s))); } - struct WrapResultyOnComplete { + struct WrapResultOnComplete { template struct Callback { void operator()(const FutureImpl& impl) && { @@ -461,7 +461,7 @@ class [[nodiscard]] Future { template using WrapOnComplete = typename std::conditional< detail::first_arg_is_status::value, WrapStatusyOnComplete, - WrapResultyOnComplete>::type::template Callback; + WrapResultOnComplete>::type::template Callback; /// \brief Consumer API: Register a callback to run when this future completes /// diff --git a/cpp/src/arrow/util/int_util.h b/cpp/src/arrow/util/int_util.h index 5ce9dc2820ee1..59a2ac7109a3c 100644 --- a/cpp/src/arrow/util/int_util.h +++ b/cpp/src/arrow/util/int_util.h @@ -113,7 +113,7 @@ Status CheckIntegersInRange(const ArraySpan& values, const Scalar& bound_lower, ARROW_EXPORT Status IntegersCanFit(const ArraySpan& values, const DataType& target_type); -/// \brief Convenience for boundschecking a single Scalar vlue +/// \brief Convenience for boundschecking a single Scalar value ARROW_EXPORT Status IntegersCanFit(const Scalar& value, const DataType& target_type); diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index ac92618ff6603..751ef28d415e0 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -1466,7 +1466,7 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, return StatusFromMmapErrno("ftruncate failed"); } // we set READ / WRITE flags on the new map, since we could only have - // unlarged a RW map in the first place + // enlarged a RW map in the first place *new_addr = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, fildes, 0); if (*new_addr == MAP_FAILED) { return StatusFromMmapErrno("mmap failed"); diff --git a/cpp/src/arrow/util/iterator.h b/cpp/src/arrow/util/iterator.h index 0eae7f6a8571b..5e716d0fd113d 100644 --- a/cpp/src/arrow/util/iterator.h +++ b/cpp/src/arrow/util/iterator.h @@ -50,7 +50,7 @@ struct IterationTraits { static T End() { return T(NULLPTR); } /// \brief Checks to see if the value is a terminal value. - /// A method is used here since T is not neccesarily comparable in many + /// A method is used here since T is not necessarily comparable in many /// cases even though it has a distinct final value static bool IsEnd(const T& val) { return val == End(); } }; diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc new file mode 100644 index 0000000000000..15196ff8c12cf --- /dev/null +++ b/cpp/src/arrow/util/list_util.cc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" + +namespace arrow::list_util { + +namespace internal { + +namespace { + +using arrow::internal::checked_cast; +using arrow::internal::ReverseSetBitRunReader; +using arrow::internal::SetBitRunReader; + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +std::optional MinViewOffset(const ArraySpan& input) { + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + + // Make an access to the sizes buffer only when strictly necessary. +#define MINIMIZE_MIN_VIEW_OFFSET(i) \ + auto offset = offsets[i]; \ + if (min_offset.has_value()) { \ + if (offset < *min_offset && sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } else { \ + if (sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } + + std::optional min_offset; + if (validity == nullptr) { + for (int64_t i = 0; i < input.length; i++) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } else { + SetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position; i < run.position + run.length; ++i) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } + } + return min_offset; + +#undef MINIMIZE_MIN_VIEW_OFFSET +} + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +int64_t MaxViewEnd(const ArraySpan& input) { + const auto values_length = input.child_data[0].length; + + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + +#define MAXIMIZE_MAX_VIEW_END(i) \ + const auto offset = static_cast(offsets[i]); \ + const offset_type size = sizes[i]; \ + if (size > 0) { \ + const int64_t end = offset + size; \ + if (end > max_end) { \ + if (end == values_length) { \ + return values_length; \ + } \ + max_end = end; \ + } \ + } + + int64_t max_end = 0; + if (validity == nullptr) { + for (int64_t i = input.length - 1; i >= 0; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } else { + ReverseSetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position + run.length - 1; i >= run.position; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } + } + return max_end; + +#undef MAXIMIZE_MAX_VIEW_END +} + +template +std::pair RangeOfValuesUsedByListView(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + if (input.length == 0 || input.null_count == input.length) { 
+ return {0, 0}; + } + const auto min_offset = MinViewOffset(input); + // If all list-views are empty, min_offset will be std::nullopt. + if (!min_offset.has_value()) { + return {0, 0}; + } + const int64_t max_end = MaxViewEnd(input); + return {*min_offset, max_end - *min_offset}; +} + +template +std::pair RangeOfValuesUsedByList(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + if (input.length == 0) { + return {0, 0}; + } + const auto* offsets = input.buffers[1].data_as(); + const int64_t min_offset = offsets[input.offset]; + const int64_t max_end = offsets[input.offset + input.length]; + return {min_offset, max_end - min_offset}; +} + +template +int64_t SumOfListSizes(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, offsets](int64_t run_start, int64_t run_length) { + sum += offsets[run_start + run_length + 1] - offsets[run_start]; + }); + return sum; +} + +template +int64_t SumOfListViewSizes(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* sizes = input.GetValues(2); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, sizes](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + sum += sizes[i]; + } + }); + return sum; +} + +} // namespace + +Result> RangeOfValuesUsed(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return RangeOfValuesUsedByList(input); + case Type::MAP: + return RangeOfValuesUsedByList(input); + case Type::LARGE_LIST: + return RangeOfValuesUsedByList(input); + case Type::LIST_VIEW: + return RangeOfValuesUsedByListView(input); + case Type::LARGE_LIST_VIEW: + return RangeOfValuesUsedByListView(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "RangeOfValuesUsed: input is not a var-length list-like array"); +} + +Result SumOfLogicalListSizes(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return SumOfListSizes(input); + case Type::MAP: + return SumOfListSizes(input); + case Type::LARGE_LIST: + return SumOfListSizes(input); + case Type::LIST_VIEW: + return SumOfListViewSizes(input); + case Type::LARGE_LIST_VIEW: + return SumOfListViewSizes(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "SumOfLogicalListSizes: input is not a var-length list-like array"); +} + +} // namespace internal + +} // namespace arrow::list_util diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h new file mode 100644 index 0000000000000..58deb8019d941 --- /dev/null +++ b/cpp/src/arrow/util/list_util.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/data.h" +#include "arrow/result.h" + +namespace arrow { +namespace list_util { +namespace internal { + +/// \brief Calculate the smallest continuous range of values used by the +/// var-length list-like input (list, map and list-view types). +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return A pair of (offset, length) describing the range +ARROW_EXPORT Result> RangeOfValuesUsed( + const ArraySpan& input); + +/// \brief Calculate the sum of the sizes of all valid lists or list-views +/// +/// This is usually the same as the length of the RangeOfValuesUsed() range, but +/// it can be: +/// - Smaller: when the child array contains many values that are not +/// referenced by the lists or list-views in the parent array +/// - Greater: when the list-views share child array ranges +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return The sum of all list or list-view sizes +ARROW_EXPORT Result SumOfLogicalListSizes(const ArraySpan& input); + +} // namespace internal + +} // namespace list_util +} // namespace arrow diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc new file mode 100644 index 0000000000000..4021180b2bef3 --- /dev/null +++ b/cpp/src/arrow/util/list_util_test.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
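As the list_util.h comments above describe, RangeOfValuesUsed() returns the smallest contiguous (offset, length) slice of the child array referenced by a list-like parent, and SumOfLogicalListSizes() can be smaller (unreferenced values) or larger (overlapping list-views) than that range. A sketch of the intended call pattern for the internal API, with ArrayFromJSON as a test-only convenience and a hypothetical function name:

#include "arrow/array.h"
#include "arrow/array/data.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON
#include "arrow/type.h"
#include "arrow/util/list_util.h"

// Sketch: after slicing away the first list, only values [3, 4, 5, 6] remain
// reachable; a kernel could restrict copying or validation to that slice.
arrow::Status InspectUsedRange() {
  auto lists = arrow::ArrayFromJSON(arrow::list(arrow::int16()),
                                    "[[1, 2], [3], [4, 5, 6]]")
                   ->Slice(1, 2);  // keeps [[3], [4, 5, 6]]
  arrow::ArraySpan span(*lists->data());
  ARROW_ASSIGN_OR_RAISE(auto range,
                        arrow::list_util::internal::RangeOfValuesUsed(span));
  // range is {2, 4}: the used child values start at offset 2 and span 4 slots
  ARROW_ASSIGN_OR_RAISE(int64_t logical_size,
                        arrow::list_util::internal::SumOfLogicalListSizes(span));
  // logical_size is also 4 here; it would differ if views overlapped or left gaps
  return range.second == logical_size ? arrow::Status::OK()
                                      : arrow::Status::Invalid("unexpected range");
}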
+ +#include + +#include "arrow/array/builder_nested.h" +#include "arrow/util/list_util.h" + +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +using ListAndListViewTypes = + ::testing::Types; + +template +class TestListUtils : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_.reset(checked_cast(tmp.release())); + } + + void TestRangeOfValuesUsed() { + std::shared_ptr result; + + // These list-views are built manually with the list-view builders instead + // of using something like ArrayFromJSON() because we want to test the + // RangeOfValuesUsed() function's ability to handle arrays containing + // overlapping list-views. + + // Empty list-like array + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(auto range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // List-like array with only nulls + ASSERT_OK(builder_->AppendNulls(3)); + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // Array with nulls and non-nulls (starting at a non-zero offset) + Int16Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } + std::shared_ptr array; + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + ASSERT_EQ(range.second, 5); + + // Overlapping list-views + vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + ASSERT_OK(vb->Append(5)); + // -- used range ends here -- + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } else { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + ASSERT_OK(vb->Append(5)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + 
ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + } + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_ARRAYS_EQUAL( + *array, *ArrayFromJSON( + type_, "[null, [0, 1, 2, 3, 4, 5], [1, 2], null, [4], null, null]")); + // Check the range + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + if constexpr (is_list_view_type::value) { + ASSERT_EQ(range.second, 6); + } else { + ASSERT_EQ(range.second, 9); + } + // Check the sum of logical sizes as well + ASSERT_OK_AND_ASSIGN(int64_t sum_of_logical_sizes, + list_util::internal::SumOfLogicalListSizes(*array->data())); + ASSERT_EQ(sum_of_logical_sizes, 9); + } + + protected: + MemoryPool* pool_ = default_memory_pool(); + std::shared_ptr type_; + std::shared_ptr value_type_; + std::shared_ptr builder_; +}; + +TYPED_TEST_SUITE(TestListUtils, ListAndListViewTypes); + +TYPED_TEST(TestListUtils, RangeOfValuesUsed) { this->TestRangeOfValuesUsed(); } + +} // namespace arrow diff --git a/cpp/src/arrow/util/logging.cc b/cpp/src/arrow/util/logging.cc index 6d275fa2864a2..9c68982a3d59f 100644 --- a/cpp/src/arrow/util/logging.cc +++ b/cpp/src/arrow/util/logging.cc @@ -148,7 +148,7 @@ void ArrowLog::StartArrowLog(const std::string& app_name, #ifdef ARROW_USE_GLOG int mapped_severity_threshold = GetMappedSeverity(severity_threshold_); google::SetStderrLogging(mapped_severity_threshold); - // Enble log file if log_dir is not empty. + // Enable log file if log_dir is not empty. if (!log_dir.empty()) { auto dir_ends_with_slash = log_dir; if (log_dir[log_dir.length() - 1] != '/') { diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 819de5eb60c63..83fb4d3a9a738 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -69,7 +69,7 @@ int64_t FindPhysicalIndexImpl(PhysicalIndexFinder& self, int64_t i) DCHECK_LT(i, self.array_span.length); const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length; DCHECK_LT(self.last_physical_index, run_ends_size); - // This access to self.run_ends[last_physical_index] is alwas safe because: + // This access to self.run_ends[last_physical_index] is always safe because: // 1. 0 <= i < array_span.length() implies there is at least one run and the initial // value 0 will be safe to index with. // 2. last_physical_index > 0 is always the result of a valid call to diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index 2b7940154a50b..a3e745ba830a3 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -128,7 +128,7 @@ int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_off /// run-ends) necessary to represent the logical range of values from /// offset to length. /// -/// Avoid calling this function if the physical length can be estabilished in +/// Avoid calling this function if the physical length can be established in /// some other way (e.g. when iterating over the runs sequentially until the /// end). This function uses binary-search, so it has a O(log N) cost. template @@ -217,7 +217,7 @@ ARROW_EXPORT int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, /// run-ends) necessary to represent the logical range of values from /// offset to length. 
/// -/// Avoid calling this function if the physical length can be estabilished in +/// Avoid calling this function if the physical length can be established in /// some other way (e.g. when iterating over the runs sequentially until the /// end). This function uses binary-search, so it has a O(log N) cost. ARROW_EXPORT int64_t FindPhysicalLength(const ArraySpan& span); diff --git a/cpp/src/arrow/util/ree_util_test.cc b/cpp/src/arrow/util/ree_util_test.cc index 966cbd8f386f1..08a6a39b98d5c 100644 --- a/cpp/src/arrow/util/ree_util_test.cc +++ b/cpp/src/arrow/util/ree_util_test.cc @@ -101,7 +101,7 @@ TYPED_TEST_P(ReeUtilTest, PhysicalLength) { ASSERT_EQ(internal::FindPhysicalLength(run_ends246, 4, 0, 7), 0); } -TYPED_TEST_P(ReeUtilTest, MergedRunsInterator) { +TYPED_TEST_P(ReeUtilTest, MergedRunsIteratorTest) { // Construct the following two test arrays with a lot of different offsets to test the // REE iterator: left: // @@ -387,7 +387,7 @@ TYPED_TEST_P(ReeUtilTest, MergedRunsInterator) { } REGISTER_TYPED_TEST_SUITE_P(ReeUtilTest, PhysicalIndex, PhysicalLength, - MergedRunsInterator); + MergedRunsIteratorTest); using RunEndsTypes = testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(ReeUtilTest, ReeUtilTest, RunEndsTypes); diff --git a/cpp/src/arrow/util/rle_encoding.h b/cpp/src/arrow/util/rle_encoding.h index 3a517d24b46c6..e0f5690062a04 100644 --- a/cpp/src/arrow/util/rle_encoding.h +++ b/cpp/src/arrow/util/rle_encoding.h @@ -61,7 +61,7 @@ namespace util { /// on a byte boundary without padding. /// Given that we know it is a multiple of 8, we store the number of 8-groups rather than /// the actual number of encoded ints. (This means that the total number of encoded values -/// can not be determined from the encoded data, since the number of values in the last +/// cannot be determined from the encoded data, since the number of values in the last /// group may not be a multiple of 8). For the last group of literal runs, we pad /// the group to 8 with zeros. This allows for 8 at a time decoding on the read side /// without the need for additional checks. 
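To make the grouping rule in that rle_encoding.h comment concrete, a small worked example with assumed numbers (not taken from the patch): with bit_width = 3 and 11 literal values, the writer stores 2 eight-value groups and pads the last group with 5 zero values, so only the group count, not the exact value count, is recoverable from the stream.

// Worked example of the literal-run bookkeeping described above
// (hypothetical numbers; not code from this patch).
constexpr int64_t kBitWidth = 3;
constexpr int64_t kNumValues = 11;
constexpr int64_t kNumGroups = (kNumValues + 7) / 8;          // 2 groups stored in the header
constexpr int64_t kPaddedValues = kNumGroups * 8;             // 16 values after zero-padding
constexpr int64_t kRunBytes = kPaddedValues * kBitWidth / 8;  // 6 bytes of packed data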
diff --git a/cpp/src/arrow/util/string_test.cc b/cpp/src/arrow/util/string_test.cc index 5f8054f12161f..f222b938d5a32 100644 --- a/cpp/src/arrow/util/string_test.cc +++ b/cpp/src/arrow/util/string_test.cc @@ -136,7 +136,7 @@ TEST(SplitString, OuterLeftAndOuterRightDelimiter) { EXPECT_EQ(parts[4], ""); } -TEST(SplitString, OnlyDemiliter) { +TEST(SplitString, OnlyDelimiter) { std::string input = ":"; auto parts = SplitString(input, ':'); ASSERT_EQ(parts.size(), 2); diff --git a/cpp/src/arrow/util/tdigest.cc b/cpp/src/arrow/util/tdigest.cc index ee84a5ef6b2f5..ca864d98361b4 100644 --- a/cpp/src/arrow/util/tdigest.cc +++ b/cpp/src/arrow/util/tdigest.cc @@ -341,7 +341,7 @@ class TDigest::TDigestImpl { double total_weight() const { return total_weight_; } private: - // must be delcared before merger_, see constructor initialization list + // must be declared before merger_, see constructor initialization list const uint32_t delta_; TDigestMerger<> merger_; diff --git a/cpp/src/arrow/util/tdigest_test.cc b/cpp/src/arrow/util/tdigest_test.cc index f501285b34f5a..63395b676a61f 100644 --- a/cpp/src/arrow/util/tdigest_test.cc +++ b/cpp/src/arrow/util/tdigest_test.cc @@ -44,7 +44,7 @@ TEST(TDigestTest, SingleValue) { TDigest td; td.Add(value); ASSERT_OK(td.Validate()); - // all quantiles equal to same single vaue + // all quantiles equal to same single value for (double q = 0; q <= 1; q += 0.1) { EXPECT_EQ(td.Quantile(q), value); } diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc index ad30ca2e8052d..8f43bb8dec367 100644 --- a/cpp/src/arrow/util/thread_pool_test.cc +++ b/cpp/src/arrow/util/thread_pool_test.cc @@ -699,7 +699,7 @@ TEST_F(TestThreadPool, SetCapacity) { } ASSERT_OK(gating_task->WaitForRunning(3)); SleepFor(0.001); // Sleep a bit just to make sure it isn't making any threads - ASSERT_EQ(pool->GetActualCapacity(), 3); // maxxed out + ASSERT_EQ(pool->GetActualCapacity(), 3); // maxed out // The tasks have not finished yet, increasing the desired capacity // should spawn threads immediately. 
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index e057f6b12fb1b..cca99033c9350 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -63,6 +63,8 @@ ARRAY_VISITOR_DEFAULT(MonthIntervalArray) ARRAY_VISITOR_DEFAULT(DurationArray) ARRAY_VISITOR_DEFAULT(ListArray) ARRAY_VISITOR_DEFAULT(LargeListArray) +ARRAY_VISITOR_DEFAULT(ListViewArray) +ARRAY_VISITOR_DEFAULT(LargeListViewArray) ARRAY_VISITOR_DEFAULT(MapArray) ARRAY_VISITOR_DEFAULT(FixedSizeListArray) ARRAY_VISITOR_DEFAULT(StructArray) @@ -117,6 +119,8 @@ TYPE_VISITOR_DEFAULT(Decimal128Type) TYPE_VISITOR_DEFAULT(Decimal256Type) TYPE_VISITOR_DEFAULT(ListType) TYPE_VISITOR_DEFAULT(LargeListType) +TYPE_VISITOR_DEFAULT(ListViewType) +TYPE_VISITOR_DEFAULT(LargeListViewType) TYPE_VISITOR_DEFAULT(MapType) TYPE_VISITOR_DEFAULT(FixedSizeListType) TYPE_VISITOR_DEFAULT(StructType) @@ -170,6 +174,8 @@ SCALAR_VISITOR_DEFAULT(Decimal128Scalar) SCALAR_VISITOR_DEFAULT(Decimal256Scalar) SCALAR_VISITOR_DEFAULT(ListScalar) SCALAR_VISITOR_DEFAULT(LargeListScalar) +SCALAR_VISITOR_DEFAULT(ListViewScalar) +SCALAR_VISITOR_DEFAULT(LargeListViewScalar) SCALAR_VISITOR_DEFAULT(MapScalar) SCALAR_VISITOR_DEFAULT(FixedSizeListScalar) SCALAR_VISITOR_DEFAULT(StructScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 650b0e7ee0a30..75ef46ae4e5c3 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -64,6 +64,8 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const Decimal256Array& array); virtual Status Visit(const ListArray& array); virtual Status Visit(const LargeListArray& array); + virtual Status Visit(const ListViewArray& array); + virtual Status Visit(const LargeListViewArray& array); virtual Status Visit(const MapArray& array); virtual Status Visit(const FixedSizeListArray& array); virtual Status Visit(const StructArray& array); @@ -115,6 +117,8 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const Decimal256Type& type); virtual Status Visit(const ListType& type); virtual Status Visit(const LargeListType& type); + virtual Status Visit(const ListViewType& scalar); + virtual Status Visit(const LargeListViewType& scalar); virtual Status Visit(const MapType& type); virtual Status Visit(const FixedSizeListType& type); virtual Status Visit(const StructType& type); @@ -166,6 +170,8 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const Decimal256Scalar& scalar); virtual Status Visit(const ListScalar& scalar); virtual Status Visit(const LargeListScalar& scalar); + virtual Status Visit(const ListViewScalar& scalar); + virtual Status Visit(const LargeListViewScalar& scalar); virtual Status Visit(const MapScalar& scalar); virtual Status Visit(const FixedSizeListScalar& scalar); virtual Status Visit(const StructScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 4b57abe53ff14..cbb081bfed311 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -59,6 +59,8 @@ namespace arrow { ACTION(Decimal256); \ ACTION(List); \ ACTION(LargeList); \ + ACTION(ListView); \ + ACTION(LargeListView); \ ACTION(Map); \ ACTION(FixedSizeList); \ ACTION(Struct); \ diff --git a/cpp/src/gandiva/function_holder_maker_registry.cc b/cpp/src/gandiva/function_holder_maker_registry.cc index bb93402475ae8..37ca1fbf6c3fe 100644 --- a/cpp/src/gandiva/function_holder_maker_registry.cc +++ b/cpp/src/gandiva/function_holder_maker_registry.cc @@ -41,9 +41,7 @@ arrow::Status FunctionHolderMakerRegistry::Register(const std::string& 
name, template static arrow::Result HolderMaker(const FunctionNode& node) { - std::shared_ptr derived_instance; - ARROW_RETURN_NOT_OK(HolderType::Make(node, &derived_instance)); - return derived_instance; + return HolderType::Make(node); } arrow::Result FunctionHolderMakerRegistry::Make( diff --git a/cpp/src/gandiva/interval_holder.cc b/cpp/src/gandiva/interval_holder.cc index d63a11a10d341..70f779263525f 100644 --- a/cpp/src/gandiva/interval_holder.cc +++ b/cpp/src/gandiva/interval_holder.cc @@ -258,26 +258,26 @@ int64_t IntervalDaysHolder::operator()(ExecutionContext* ctx, const char* data, return 0; } -Status IntervalDaysHolder::Make(const FunctionNode& node, - std::shared_ptr* holder) { +Result> IntervalDaysHolder::Make( + const FunctionNode& node) { const std::string function_name("castINTERVALDAY"); - return IntervalHolder::Make(node, holder, function_name); + return IntervalHolder::Make(node, function_name); } -Status IntervalDaysHolder::Make(int32_t suppress_errors, - std::shared_ptr* holder) { - return IntervalHolder::Make(suppress_errors, holder); +Result> IntervalDaysHolder::Make( + int32_t suppress_errors) { + return IntervalHolder::Make(suppress_errors); } -Status IntervalYearsHolder::Make(const FunctionNode& node, - std::shared_ptr* holder) { +Result> IntervalYearsHolder::Make( + const FunctionNode& node) { const std::string function_name("castINTERVALYEAR"); - return IntervalHolder::Make(node, holder, function_name); + return IntervalHolder::Make(node, function_name); } -Status IntervalYearsHolder::Make(int32_t suppress_errors, - std::shared_ptr* holder) { - return IntervalHolder::Make(suppress_errors, holder); +Result> IntervalYearsHolder::Make( + int32_t suppress_errors) { + return IntervalHolder::Make(suppress_errors); } // The operator will cast a generic string defined by the user into an interval of months. 
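The change above (and the matching updates to the other Gandiva holders further down) replaces the Status-plus-out-parameter factories with arrow::Result-returning ones. A short sketch of the new calling convention, using a hypothetical call site:

#include <memory>

#include "arrow/result.h"
#include "arrow/status.h"
#include "gandiva/interval_holder.h"
#include "gandiva/node.h"

// Hypothetical caller; only IntervalDaysHolder::Make(node) is from this patch.
arrow::Status UseHolder(const gandiva::FunctionNode& node) {
  // Previously: Status Make(node, &holder) with a shared_ptr out-parameter.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<gandiva::IntervalDaysHolder> holder,
                        gandiva::IntervalDaysHolder::Make(node));
  // ... evaluate (*holder)(ctx, data, data_len, in_valid, &out_valid) as needed ...
  return arrow::Status::OK();
}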
diff --git a/cpp/src/gandiva/interval_holder.h b/cpp/src/gandiva/interval_holder.h index 38d8e9f86a9bc..0a6a988025406 100644 --- a/cpp/src/gandiva/interval_holder.h +++ b/cpp/src/gandiva/interval_holder.h @@ -39,8 +39,8 @@ class GANDIVA_EXPORT IntervalHolder : public FunctionHolder { ~IntervalHolder() override = default; protected: - static Status Make(const FunctionNode& node, std::shared_ptr* holder, - const std::string& function_name) { + static Result> Make(const FunctionNode& node, + const std::string& function_name) { ARROW_RETURN_IF(node.children().size() != 1 && node.children().size() != 2, Status::Invalid(function_name + " requires one or two parameters")); @@ -63,14 +63,11 @@ class GANDIVA_EXPORT IntervalHolder : public FunctionHolder { suppress_errors = std::get(literal_suppress_errors->holder()); } - return Make(suppress_errors, holder); + return Make(suppress_errors); } - static Status Make(int32_t suppress_errors, std::shared_ptr* holder) { - auto lholder = std::make_shared(suppress_errors); - - *holder = lholder; - return Status::OK(); + static Result> Make(int32_t suppress_errors) { + return std::make_shared(suppress_errors); } explicit IntervalHolder(int32_t supress_errors) : suppress_errors_(supress_errors) {} @@ -94,11 +91,9 @@ class GANDIVA_EXPORT IntervalDaysHolder : public IntervalHolder* holder); + static Result> Make(const FunctionNode& node); - static Status Make(int32_t suppress_errors, - std::shared_ptr* holder); + static Result> Make(int32_t suppress_errors); /// Cast a generic string to an interval int64_t operator()(ExecutionContext* ctx, const char* data, int32_t data_len, @@ -131,11 +126,9 @@ class GANDIVA_EXPORT IntervalYearsHolder : public IntervalHolder* holder); + static Result> Make(const FunctionNode& node); - static Status Make(int32_t suppress_errors, - std::shared_ptr* holder); + static Result> Make(int32_t suppress_errors); /// Cast a generic string to an interval int32_t operator()(ExecutionContext* ctx, const char* data, int32_t data_len, diff --git a/cpp/src/gandiva/interval_holder_test.cc b/cpp/src/gandiva/interval_holder_test.cc index fbfd6335f4568..59a10cace8cd5 100644 --- a/cpp/src/gandiva/interval_holder_test.cc +++ b/cpp/src/gandiva/interval_holder_test.cc @@ -20,8 +20,8 @@ #include #include -#include +#include "arrow/testing/gtest_util.h" #include "gandiva/execution_context.h" namespace gandiva { @@ -32,14 +32,8 @@ class TestIntervalHolder : public ::testing::Test { }; TEST_F(TestIntervalHolder, TestMatchAllPeriods) { - std::shared_ptr interval_days_holder; - std::shared_ptr interval_years_holder; - - auto status = IntervalDaysHolder::Make(0, &interval_days_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - status = IntervalYearsHolder::Make(0, &interval_years_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto interval_days_holder, IntervalDaysHolder::Make(0)); + EXPECT_OK_AND_ASSIGN(auto interval_years_holder, IntervalYearsHolder::Make(0)); auto& cast_interval_day = *interval_days_holder; auto& cast_interval_year = *interval_years_holder; @@ -289,14 +283,8 @@ TEST_F(TestIntervalHolder, TestMatchAllPeriods) { } TEST_F(TestIntervalHolder, TestMatchErrorsForCastIntervalDay) { - std::shared_ptr interval_days_holder; - std::shared_ptr interval_years_holder; - - auto status = IntervalDaysHolder::Make(0, &interval_days_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - status = IntervalYearsHolder::Make(0, &interval_years_holder); - EXPECT_EQ(status.ok(), true) << 
status.message(); + EXPECT_OK_AND_ASSIGN(auto interval_days_holder, IntervalDaysHolder::Make(0)); + EXPECT_OK_AND_ASSIGN(auto interval_years_holder, IntervalYearsHolder::Make(0)); auto& cast_interval_day = *interval_days_holder; auto& cast_interval_year = *interval_years_holder; @@ -440,12 +428,8 @@ TEST_F(TestIntervalHolder, TestMatchErrorsForCastIntervalDay) { } TEST_F(TestIntervalHolder, TestUsingWeekFormatterForCastIntervalDay) { - std::shared_ptr interval_holder; - - auto status = IntervalDaysHolder::Make(0, &interval_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - auto& cast_interval_day = *interval_holder; + EXPECT_OK_AND_ASSIGN(auto interval_days_holder, IntervalDaysHolder::Make(0)); + auto& cast_interval_day = *interval_days_holder; bool out_valid; std::string data("P1W"); @@ -465,12 +449,8 @@ TEST_F(TestIntervalHolder, TestUsingWeekFormatterForCastIntervalDay) { } TEST_F(TestIntervalHolder, TestUsingCompleteFormatterForCastIntervalDay) { - std::shared_ptr interval_holder; - - auto status = IntervalDaysHolder::Make(0, &interval_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - auto& cast_interval_day = *interval_holder; + EXPECT_OK_AND_ASSIGN(auto interval_days_holder, IntervalDaysHolder::Make(0)); + auto& cast_interval_day = *interval_days_holder; bool out_valid; std::string data("1742461111"); @@ -528,11 +508,7 @@ TEST_F(TestIntervalHolder, TestUsingCompleteFormatterForCastIntervalDay) { } TEST_F(TestIntervalHolder, TestUsingCompleteFormatterForCastIntervalYear) { - std::shared_ptr interval_years_holder; - - auto status = IntervalYearsHolder::Make(0, &interval_years_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - + EXPECT_OK_AND_ASSIGN(auto interval_years_holder, IntervalYearsHolder::Make(0)); auto& cast_interval_years = *interval_years_holder; bool out_valid; diff --git a/cpp/src/gandiva/random_generator_holder.cc b/cpp/src/gandiva/random_generator_holder.cc index 3d395741d70c2..8f80c5826d936 100644 --- a/cpp/src/gandiva/random_generator_holder.cc +++ b/cpp/src/gandiva/random_generator_holder.cc @@ -19,14 +19,13 @@ #include "gandiva/node.h" namespace gandiva { -Status RandomGeneratorHolder::Make(const FunctionNode& node, - std::shared_ptr* holder) { +Result> RandomGeneratorHolder::Make( + const FunctionNode& node) { ARROW_RETURN_IF(node.children().size() > 1, Status::Invalid("'random' function requires at most one parameter")); if (node.children().size() == 0) { - *holder = std::shared_ptr(new RandomGeneratorHolder()); - return Status::OK(); + return std::shared_ptr(new RandomGeneratorHolder()); } auto literal = dynamic_cast(node.children().at(0).get()); @@ -38,8 +37,7 @@ Status RandomGeneratorHolder::Make(const FunctionNode& node, literal_type != arrow::Type::INT32, Status::Invalid("'random' function requires an int32 literal as parameter")); - *holder = std::shared_ptr(new RandomGeneratorHolder( + return std::shared_ptr(new RandomGeneratorHolder( literal->is_null() ? 
0 : std::get(literal->holder()))); - return Status::OK(); } } // namespace gandiva diff --git a/cpp/src/gandiva/random_generator_holder.h b/cpp/src/gandiva/random_generator_holder.h index 65b6607e87840..ffab725aa7fc7 100644 --- a/cpp/src/gandiva/random_generator_holder.h +++ b/cpp/src/gandiva/random_generator_holder.h @@ -34,8 +34,7 @@ class GANDIVA_EXPORT RandomGeneratorHolder : public FunctionHolder { public: ~RandomGeneratorHolder() override = default; - static Status Make(const FunctionNode& node, - std::shared_ptr* holder); + static Result> Make(const FunctionNode& node); double operator()() { return distribution_(generator_); } diff --git a/cpp/src/gandiva/random_generator_holder_test.cc b/cpp/src/gandiva/random_generator_holder_test.cc index 4b16c1b7d0d9a..77b2750f2e95d 100644 --- a/cpp/src/gandiva/random_generator_holder_test.cc +++ b/cpp/src/gandiva/random_generator_holder_test.cc @@ -21,38 +21,35 @@ #include +#include "arrow/testing/gtest_util.h" + namespace gandiva { class TestRandGenHolder : public ::testing::Test { public: - FunctionNode BuildRandFunc() { return FunctionNode("random", {}, arrow::float64()); } + FunctionNode BuildRandFunc() { return {"random", {}, arrow::float64()}; } FunctionNode BuildRandWithSeedFunc(int32_t seed, bool seed_is_null) { auto seed_node = std::make_shared(arrow::int32(), LiteralHolder(seed), seed_is_null); - return FunctionNode("rand", {seed_node}, arrow::float64()); + return {"rand", {seed_node}, arrow::float64()}; } }; TEST_F(TestRandGenHolder, NoSeed) { - std::shared_ptr rand_gen_holder; FunctionNode rand_func = BuildRandFunc(); - auto status = RandomGeneratorHolder::Make(rand_func, &rand_gen_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder, RandomGeneratorHolder::Make(rand_func)); auto& random = *rand_gen_holder; EXPECT_NE(random(), random()); } TEST_F(TestRandGenHolder, WithValidEqualSeeds) { - std::shared_ptr rand_gen_holder_1; - std::shared_ptr rand_gen_holder_2; FunctionNode rand_func_1 = BuildRandWithSeedFunc(12, false); FunctionNode rand_func_2 = BuildRandWithSeedFunc(12, false); - auto status = RandomGeneratorHolder::Make(rand_func_1, &rand_gen_holder_1); - EXPECT_EQ(status.ok(), true) << status.message(); - status = RandomGeneratorHolder::Make(rand_func_2, &rand_gen_holder_2); - EXPECT_EQ(status.ok(), true) << status.message(); + + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder_1, RandomGeneratorHolder::Make(rand_func_1)); + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder_2, RandomGeneratorHolder::Make(rand_func_2)); auto& random_1 = *rand_gen_holder_1; auto& random_2 = *rand_gen_holder_2; @@ -65,18 +62,12 @@ TEST_F(TestRandGenHolder, WithValidEqualSeeds) { } TEST_F(TestRandGenHolder, WithValidSeeds) { - std::shared_ptr rand_gen_holder_1; - std::shared_ptr rand_gen_holder_2; - std::shared_ptr rand_gen_holder_3; FunctionNode rand_func_1 = BuildRandWithSeedFunc(11, false); FunctionNode rand_func_2 = BuildRandWithSeedFunc(12, false); FunctionNode rand_func_3 = BuildRandWithSeedFunc(-12, false); - auto status = RandomGeneratorHolder::Make(rand_func_1, &rand_gen_holder_1); - EXPECT_EQ(status.ok(), true) << status.message(); - status = RandomGeneratorHolder::Make(rand_func_2, &rand_gen_holder_2); - EXPECT_EQ(status.ok(), true) << status.message(); - status = RandomGeneratorHolder::Make(rand_func_3, &rand_gen_holder_3); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder_1, RandomGeneratorHolder::Make(rand_func_1)); + EXPECT_OK_AND_ASSIGN(auto 
rand_gen_holder_2, RandomGeneratorHolder::Make(rand_func_2)); + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder_3, RandomGeneratorHolder::Make(rand_func_3)); auto& random_1 = *rand_gen_holder_1; auto& random_2 = *rand_gen_holder_2; @@ -86,14 +77,10 @@ TEST_F(TestRandGenHolder, WithValidSeeds) { } TEST_F(TestRandGenHolder, WithInValidSeed) { - std::shared_ptr rand_gen_holder_1; - std::shared_ptr rand_gen_holder_2; FunctionNode rand_func_1 = BuildRandWithSeedFunc(12, true); FunctionNode rand_func_2 = BuildRandWithSeedFunc(0, false); - auto status = RandomGeneratorHolder::Make(rand_func_1, &rand_gen_holder_1); - EXPECT_EQ(status.ok(), true) << status.message(); - status = RandomGeneratorHolder::Make(rand_func_2, &rand_gen_holder_2); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder_1, RandomGeneratorHolder::Make(rand_func_1)); + EXPECT_OK_AND_ASSIGN(auto rand_gen_holder_2, RandomGeneratorHolder::Make(rand_func_2)); auto& random_1 = *rand_gen_holder_1; auto& random_2 = *rand_gen_holder_2; diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 30d68cbc87ab3..03a4af90d8991 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -48,9 +48,9 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { global_checked = true; } - std::shared_ptr holder; - auto status = Make(node, &holder); - if (status.ok()) { + auto maybe_holder = Make(node); + if (maybe_holder.ok()) { + auto holder = *maybe_holder; std::string& pattern = holder->pattern_; auto literal_type = node.children().at(1)->return_type(); @@ -83,7 +83,7 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { return node; } -Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { +Result> LikeHolder::Make(const FunctionNode& node) { ARROW_RETURN_IF(node.children().size() != 2 && node.children().size() != 3, Status::Invalid("'like' function requires two or three parameters")); @@ -102,10 +102,10 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h if (node.descriptor()->name() == "ilike") { regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. 
- return Make(std::get(literal->holder()), holder, regex_op); + return Make(std::get(literal->holder()), regex_op); } if (node.children().size() == 2) { - return Make(std::get(literal->holder()), holder); + return Make(std::get(literal->holder())); } else { auto escape_char = dynamic_cast(node.children().at(2).get()); ARROW_RETURN_IF( @@ -118,12 +118,11 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h Status::Invalid( "'like' function requires a string literal as the third parameter")); return Make(std::get(literal->holder()), - std::get(escape_char->holder()), holder); + std::get(escape_char->holder())); } } -Status LikeHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder) { +Result> LikeHolder::Make(const std::string& sql_pattern) { std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); @@ -132,12 +131,11 @@ Status LikeHolder::Make(const std::string& sql_pattern, Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); - *holder = std::move(lholder); - return Status::OK(); + return lholder; } -Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, - std::shared_ptr* holder) { +Result> LikeHolder::Make(const std::string& sql_pattern, + const std::string& escape_char) { ARROW_RETURN_IF(escape_char.length() > 1, Status::Invalid("The length of escape char ", escape_char, " in 'like' function is greater than 1")); @@ -154,28 +152,24 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); - *holder = std::move(lholder); - return Status::OK(); + return lholder; } -Status LikeHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder, RE2::Options regex_op) { +Result> LikeHolder::Make(const std::string& sql_pattern, + RE2::Options regex_op) { std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - std::shared_ptr lholder; - lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); - *holder = std::move(lholder); - return Status::OK(); + return lholder; } -Status ReplaceHolder::Make(const FunctionNode& node, - std::shared_ptr* holder) { +Result> ReplaceHolder::Make(const FunctionNode& node) { ARROW_RETURN_IF(node.children().size() != 3, Status::Invalid("'replace' function requires three parameters")); @@ -190,18 +184,17 @@ Status ReplaceHolder::Make(const FunctionNode& node, Status::Invalid( "'replace' function requires a string literal as the second parameter")); - return Make(std::get(literal->holder()), holder); + return Make(std::get(literal->holder())); } -Status ReplaceHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder) { +Result> ReplaceHolder::Make( + const std::string& sql_pattern) { auto lholder = std::shared_ptr(new ReplaceHolder(sql_pattern)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed with: ", lholder->regex_.error())); - *holder = std::move(lholder); - return Status::OK(); + return lholder; } void ReplaceHolder::return_error(ExecutionContext* context, std::string& data, @@ -211,8 +204,7 @@ void 
ReplaceHolder::return_error(ExecutionContext* context, std::string& data, context->set_error_msg(err_msg.c_str()); } -Status ExtractHolder::Make(const FunctionNode& node, - std::shared_ptr* holder) { +Result> ExtractHolder::Make(const FunctionNode& node) { ARROW_RETURN_IF(node.children().size() != 3, Status::Invalid("'extract' function requires three parameters")); @@ -221,18 +213,17 @@ Status ExtractHolder::Make(const FunctionNode& node, literal == nullptr || !IsArrowStringLiteral(literal->return_type()->id()), Status::Invalid("'extract' function requires a literal as the second parameter")); - return ExtractHolder::Make(std::get(literal->holder()), holder); + return ExtractHolder::Make(std::get(literal->holder())); } -Status ExtractHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder) { +Result> ExtractHolder::Make( + const std::string& sql_pattern) { auto lholder = std::shared_ptr(new ExtractHolder(sql_pattern)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed with: ", lholder->regex_.error())); - *holder = std::move(lholder); - return Status::OK(); + return lholder; } const char* ExtractHolder::operator()(ExecutionContext* ctx, const char* user_input, diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h index ecf4095f3d473..36d942510bb5b 100644 --- a/cpp/src/gandiva/regex_functions_holder.h +++ b/cpp/src/gandiva/regex_functions_holder.h @@ -35,15 +35,15 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { public: ~LikeHolder() override = default; - static Status Make(const FunctionNode& node, std::shared_ptr* holder); + static Result> Make(const FunctionNode& node); - static Status Make(const std::string& sql_pattern, std::shared_ptr* holder); + static Result> Make(const std::string& sql_pattern); - static Status Make(const std::string& sql_pattern, const std::string& escape_char, - std::shared_ptr* holder); + static Result> Make(const std::string& sql_pattern, + const std::string& escape_char); - static Status Make(const std::string& sql_pattern, std::shared_ptr* holder, - RE2::Options regex_op); + static Result> Make(const std::string& sql_pattern, + RE2::Options regex_op); // Try and optimise a function node with a "like" pattern. static const FunctionNode TryOptimize(const FunctionNode& node); @@ -66,10 +66,9 @@ class GANDIVA_EXPORT ReplaceHolder : public FunctionHolder { public: ~ReplaceHolder() override = default; - static Status Make(const FunctionNode& node, std::shared_ptr* holder); + static Result> Make(const FunctionNode& node); - static Status Make(const std::string& sql_pattern, - std::shared_ptr* holder); + static Result> Make(const std::string& sql_pattern); /// Return a new string with the pattern that matched the regex replaced for /// the replace_input parameter. 
@@ -130,10 +129,9 @@ class GANDIVA_EXPORT ExtractHolder : public FunctionHolder { public: ~ExtractHolder() override = default; - static Status Make(const FunctionNode& node, std::shared_ptr* holder); + static Result> Make(const FunctionNode& node); - static Status Make(const std::string& sql_pattern, - std::shared_ptr* holder); + static Result> Make(const std::string& sql_pattern); /// Extracts the matching text from a string using a regex const char* operator()(ExecutionContext* ctx, const char* user_input, diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 930f3a7ade718..534be5987a233 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -19,7 +19,8 @@ #include #include #include -#include + +#include "arrow/testing/gtest_util.h" #include "gandiva/regex_util.h" namespace gandiva { @@ -31,7 +32,7 @@ class TestLikeHolder : public ::testing::Test { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); - return FunctionNode("like", {field, pattern_node}, arrow::boolean()); + return {"like", {field, pattern_node}, arrow::boolean()}; } FunctionNode BuildLike(std::string pattern, char escape_char) { @@ -40,16 +41,12 @@ class TestLikeHolder : public ::testing::Test { std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); auto escape_char_node = std::make_shared( arrow::utf8(), LiteralHolder(std::string(1, escape_char)), false); - return FunctionNode("like", {field, pattern_node, escape_char_node}, - arrow::boolean()); + return {"like", {field, pattern_node, escape_char_node}, arrow::boolean()}; } }; TEST_F(TestLikeHolder, TestMatchAny) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab%", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab%", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("ab")); @@ -61,10 +58,7 @@ TEST_F(TestLikeHolder, TestMatchAny) { } TEST_F(TestLikeHolder, TestMatchOne) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab_", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab_", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("abc")); @@ -76,10 +70,7 @@ TEST_F(TestLikeHolder, TestMatchOne) { } TEST_F(TestLikeHolder, TestPcreSpecial) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make(".*ab_", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like(".*abc")); // . 
and * aren't special in sql regex @@ -88,34 +79,26 @@ TEST_F(TestLikeHolder, TestPcreSpecial) { TEST_F(TestLikeHolder, TestRegexEscape) { std::string res; - auto status = RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res); - EXPECT_TRUE(status.ok()) << status.message(); + ARROW_EXPECT_OK(RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res)); EXPECT_EQ(res, "%hello_abc.def#"); } TEST_F(TestLikeHolder, TestDot) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("abc.", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("abc.", regex_op)); auto& like = *like_holder; EXPECT_FALSE(like("abcd")); } TEST_F(TestLikeHolder, TestMatchSubString) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("%abc%", "\\", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\")); auto& like = *like_holder; EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\")); auto& like_reserved_char = *like_holder; EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d")); @@ -190,10 +173,7 @@ TEST_F(TestLikeHolder, TestOptimise) { } TEST_F(TestLikeHolder, TestMatchOneEscape) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab\\_", "\\", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\")); auto& like = *like_holder; @@ -207,10 +187,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { } TEST_F(TestLikeHolder, TestMatchManyEscape) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab\\%", "\\", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\")); auto& like = *like_holder; @@ -224,10 +201,7 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { } TEST_F(TestLikeHolder, TestMatchEscape) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\")); auto& like = *like_holder; @@ -237,10 +211,7 @@ TEST_F(TestLikeHolder, TestMatchEscape) { } TEST_F(TestLikeHolder, TestEmptyEscapeChar) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab\\_", "", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "")); auto& like = *like_holder; @@ -252,10 +223,7 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { } TEST_F(TestLikeHolder, TestMultipleEscapeChar) { - std::shared_ptr like_holder; - - auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder); - EXPECT_EQ(status.ok(), false) << status.message(); + ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\").status()); } class TestILikeHolder : public ::testing::Test { @@ -265,16 +233,14 @@ class TestILikeHolder : public ::testing::Test { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); - return FunctionNode("ilike", {field, pattern_node}, arrow::boolean()); + 
return {"ilike", {field, pattern_node}, arrow::boolean()}; } }; TEST_F(TestILikeHolder, TestMatchAny) { - std::shared_ptr like_holder; - regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make("ab%", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab%", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("ab")); @@ -286,11 +252,8 @@ TEST_F(TestILikeHolder, TestMatchAny) { } TEST_F(TestILikeHolder, TestMatchOne) { - std::shared_ptr like_holder; - regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make("Ab_", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("Ab_", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("abc")); @@ -302,11 +265,8 @@ TEST_F(TestILikeHolder, TestMatchOne) { } TEST_F(TestILikeHolder, TestPcreSpecial) { - std::shared_ptr like_holder; - regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make(".*aB_", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like(".*Abc")); // . and * aren't special in sql regex @@ -314,11 +274,8 @@ TEST_F(TestILikeHolder, TestPcreSpecial) { } TEST_F(TestILikeHolder, TestDot) { - std::shared_ptr like_holder; - regex_op.set_case_sensitive(false); - auto status = LikeHolder::Make("aBc.", &like_holder, regex_op); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("aBc.", regex_op)); auto& like = *like_holder; EXPECT_FALSE(like("abcd")); @@ -330,10 +287,7 @@ class TestReplaceHolder : public ::testing::Test { }; TEST_F(TestReplaceHolder, TestMultipleReplace) { - std::shared_ptr replace_holder; - - auto status = ReplaceHolder::Make("ana", &replace_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const replace_holder, ReplaceHolder::Make("ana")); std::string input_string = "banana"; std::string replace_string; @@ -378,10 +332,7 @@ TEST_F(TestReplaceHolder, TestMultipleReplace) { } TEST_F(TestReplaceHolder, TestNoMatchPattern) { - std::shared_ptr replace_holder; - - auto status = ReplaceHolder::Make("ana", &replace_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const replace_holder, ReplaceHolder::Make("ana")); std::string input_string = "apple"; std::string replace_string; @@ -398,10 +349,7 @@ TEST_F(TestReplaceHolder, TestNoMatchPattern) { } TEST_F(TestReplaceHolder, TestReplaceSameSize) { - std::shared_ptr replace_holder; - - auto status = ReplaceHolder::Make("a", &replace_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const replace_holder, ReplaceHolder::Make("a")); std::string input_string = "ananindeua"; std::string replace_string = "b"; @@ -418,11 +366,7 @@ TEST_F(TestReplaceHolder, TestReplaceSameSize) { } TEST_F(TestReplaceHolder, TestReplaceInvalidPattern) { - std::shared_ptr replace_holder; - - auto status = ReplaceHolder::Make("+", &replace_holder); - EXPECT_EQ(status.ok(), false) << status.message(); - + ASSERT_RAISES(Invalid, ReplaceHolder::Make("+")); execution_context_.Reset(); } @@ -433,11 +377,8 @@ class TestExtractHolder : public ::testing::Test { }; TEST_F(TestExtractHolder, TestSimpleExtract) { - std::shared_ptr extract_holder; - // Pattern to match of two group of 
letters - auto status = ExtractHolder::Make(R"((\w+) (\w+))", &extract_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto extract_holder, ExtractHolder::Make(R"((\w+) (\w+))")); std::string input_string = "John Doe"; int32_t extract_index = 2; // Retrieve the surname @@ -469,8 +410,7 @@ TEST_F(TestExtractHolder, TestSimpleExtract) { EXPECT_EQ(out_length, 9); EXPECT_EQ(ret_as_str, "Paul Test"); - status = ExtractHolder::Make(R"((\w+) (\w+) - (\d+))", &extract_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(extract_holder, ExtractHolder::Make(R"((\w+) (\w+) - (\d+))")); auto& extract2 = *extract_holder; @@ -502,8 +442,7 @@ TEST_F(TestExtractHolder, TestSimpleExtract) { EXPECT_EQ(ret_as_str, "John Doe - 124"); // Pattern to match only numbers - status = ExtractHolder::Make(R"(((\w+)))", &extract_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(extract_holder, ExtractHolder::Make(R"(((\w+)))")); auto& extract_numbers = *extract_holder; @@ -569,11 +508,8 @@ TEST_F(TestExtractHolder, TestSimpleExtract) { } TEST_F(TestExtractHolder, TestNoMatches) { - std::shared_ptr extract_holder; - // Pattern to match of two group of letters - auto status = ExtractHolder::Make(R"((\w+) (\w+))", &extract_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto extract_holder, ExtractHolder::Make(R"((\w+) (\w+))")); std::string input_string = "John"; int32_t extract_index = 2; // The regex will not match with the input string @@ -588,8 +524,7 @@ TEST_F(TestExtractHolder, TestNoMatches) { EXPECT_FALSE(execution_context_.has_error()); // Pattern to match only numbers - status = ExtractHolder::Make(R"(\d+)", &extract_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(extract_holder, ExtractHolder::Make(R"(\d+)")); auto& extract_numbers = *extract_holder; @@ -616,11 +551,8 @@ TEST_F(TestExtractHolder, TestNoMatches) { } TEST_F(TestExtractHolder, TestInvalidRange) { - std::shared_ptr extract_holder; - // Pattern to match of two group of letters - auto status = ExtractHolder::Make(R"((\w+) (\w+))", &extract_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto const extract_holder, ExtractHolder::Make(R"((\w+) (\w+))")); std::string input_string = "John Doe"; int32_t extract_index = -1; @@ -650,17 +582,11 @@ TEST_F(TestExtractHolder, TestInvalidRange) { } TEST_F(TestExtractHolder, TestExtractInvalidPattern) { - std::shared_ptr extract_holder; - - auto status = ExtractHolder::Make("+", &extract_holder); - EXPECT_EQ(status.ok(), false) << status.message(); - + ASSERT_RAISES(Invalid, ExtractHolder::Make("+")); execution_context_.Reset(); } TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { - std::shared_ptr extract_holder; - // Create function with incorrect number of params auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = std::make_shared( @@ -668,10 +594,10 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { auto function_node = FunctionNode("regexp_extract", {field, pattern_node}, arrow::utf8()); - auto status = ExtractHolder::Make(function_node, &extract_holder); - EXPECT_EQ(status.ok(), false); - EXPECT_THAT(status.message(), - ::testing::HasSubstr("'extract' function requires three parameters")); + auto extract_holder = ExtractHolder::Make(function_node); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("'extract' function requires three 
parameters"), + extract_holder.status()); execution_context_.Reset(); @@ -682,11 +608,12 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { function_node = FunctionNode("regexp_extract", {field, pattern_node, index_node}, arrow::utf8()); - status = ExtractHolder::Make(function_node, &extract_holder); - EXPECT_EQ(status.ok(), false); - EXPECT_THAT(status.message(), - ::testing::HasSubstr( - "'extract' function requires a literal as the second parameter")); + extract_holder = ExtractHolder::Make(function_node); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + ::testing::HasSubstr( + "'extract' function requires a literal as the second parameter"), + extract_holder.status()); execution_context_.Reset(); @@ -698,11 +625,12 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) { function_node = FunctionNode("regexp_extract", {field, pattern_as_node, index_node}, arrow::utf8()); - status = ExtractHolder::Make(function_node, &extract_holder); - EXPECT_EQ(status.ok(), false); - EXPECT_THAT(status.message(), - ::testing::HasSubstr( - "'extract' function requires a literal as the second parameter")); + extract_holder = ExtractHolder::Make(function_node); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + ::testing::HasSubstr( + "'extract' function requires a literal as the second parameter"), + extract_holder.status()); execution_context_.Reset(); } diff --git a/cpp/src/gandiva/to_date_holder.cc b/cpp/src/gandiva/to_date_holder.cc index 27a16d1779960..76f16f0cb1b74 100644 --- a/cpp/src/gandiva/to_date_holder.cc +++ b/cpp/src/gandiva/to_date_holder.cc @@ -28,8 +28,7 @@ namespace gandiva { -Status ToDateHolder::Make(const FunctionNode& node, - std::shared_ptr* holder) { +Result> ToDateHolder::Make(const FunctionNode& node) { if (node.children().size() != 2 && node.children().size() != 3) { return Status::Invalid("'to_date' function requires two or three parameters"); } @@ -51,7 +50,7 @@ Status ToDateHolder::Make(const FunctionNode& node, if (node.children().size() == 3) { auto literal_suppress_errors = dynamic_cast(node.children().at(2).get()); - if (literal_pattern == nullptr) { + if (literal_suppress_errors == nullptr) { return Status::Invalid( "The (optional) third parameter to 'to_date' function needs to an integer " "literal to indicate whether to suppress the error"); @@ -66,17 +65,15 @@ Status ToDateHolder::Make(const FunctionNode& node, suppress_errors = std::get(literal_suppress_errors->holder()); } - return Make(pattern, suppress_errors, holder); + return Make(pattern, suppress_errors); } -Status ToDateHolder::Make(const std::string& sql_pattern, int32_t suppress_errors, - std::shared_ptr* holder) { +Result> ToDateHolder::Make(const std::string& sql_pattern, + int32_t suppress_errors) { std::shared_ptr transformed_pattern; ARROW_RETURN_NOT_OK(DateUtils::ToInternalFormat(sql_pattern, &transformed_pattern)); - auto lholder = std::shared_ptr( - new ToDateHolder(*(transformed_pattern.get()), suppress_errors)); - *holder = lholder; - return Status::OK(); + return std::shared_ptr( + new ToDateHolder(*transformed_pattern, suppress_errors)); } int64_t ToDateHolder::operator()(ExecutionContext* context, const char* data, diff --git a/cpp/src/gandiva/to_date_holder.h b/cpp/src/gandiva/to_date_holder.h index 1211b6a3043ad..ac13f7a31b34c 100644 --- a/cpp/src/gandiva/to_date_holder.h +++ b/cpp/src/gandiva/to_date_holder.h @@ -35,10 +35,10 @@ class GANDIVA_EXPORT ToDateHolder : public FunctionHolder { public: ~ToDateHolder() override = default; - static Status Make(const FunctionNode& node, 
std::shared_ptr* holder); + static Result> Make(const FunctionNode& node); - static Status Make(const std::string& sql_pattern, int32_t suppress_errors, - std::shared_ptr* holder); + static Result> Make(const std::string& sql_pattern, + int32_t suppress_errors); /// Return true if the data matches the pattern. int64_t operator()(ExecutionContext* context, const char* data, int data_len, diff --git a/cpp/src/gandiva/to_date_holder_test.cc b/cpp/src/gandiva/to_date_holder_test.cc index 99036817d8e65..1612d9b4f440e 100644 --- a/cpp/src/gandiva/to_date_holder_test.cc +++ b/cpp/src/gandiva/to_date_holder_test.cc @@ -16,7 +16,6 @@ // under the License. #include -#include #include "arrow/testing/gtest_util.h" @@ -30,14 +29,18 @@ namespace gandiva { class TestToDateHolder : public ::testing::Test { public: - FunctionNode BuildToDate(std::string pattern) { + FunctionNode BuildToDate(std::string pattern, + std::shared_ptr suppress_error_node = nullptr) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); - auto suppress_error_node = - std::make_shared(arrow::int32(), LiteralHolder(0), false); - return FunctionNode("to_date_utf8_utf8_int32", - {field, pattern_node, suppress_error_node}, arrow::int64()); + if (suppress_error_node == nullptr) { + suppress_error_node = + std::make_shared(arrow::int32(), LiteralHolder(0), false); + } + return {"to_date_utf8_utf8_int32", + {field, pattern_node, std::move(suppress_error_node)}, + arrow::int64()}; } protected: @@ -45,8 +48,7 @@ class TestToDateHolder : public ::testing::Test { }; TEST_F(TestToDateHolder, TestSimpleDateTime) { - std::shared_ptr to_date_holder; - ASSERT_OK(ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 1, &to_date_holder)); + EXPECT_OK_AND_ASSIGN(auto to_date_holder, ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 1)); auto& to_date = *to_date_holder; bool out_valid; @@ -86,8 +88,7 @@ TEST_F(TestToDateHolder, TestSimpleDateTime) { } TEST_F(TestToDateHolder, TestSimpleDate) { - std::shared_ptr to_date_holder; - ASSERT_OK(ToDateHolder::Make("YYYY-MM-DD", 1, &to_date_holder)); + EXPECT_OK_AND_ASSIGN(auto to_date_holder, ToDateHolder::Make("YYYY-MM-DD", 1)); auto& to_date = *to_date_holder; bool out_valid; @@ -119,10 +120,7 @@ TEST_F(TestToDateHolder, TestSimpleDate) { } TEST_F(TestToDateHolder, TestSimpleDateTimeError) { - std::shared_ptr to_date_holder; - - auto status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 0, &to_date_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + EXPECT_OK_AND_ASSIGN(auto to_date_holder, ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 0)); auto& to_date = *to_date_holder; bool out_valid; @@ -132,8 +130,7 @@ TEST_F(TestToDateHolder, TestSimpleDateTimeError) { EXPECT_EQ(0, millis_since_epoch); std::string expected_error = "Error parsing value 1986-01-40 01:01:01 +0800 for given format"; - EXPECT_TRUE(execution_context_.get_error().find(expected_error) != std::string::npos) - << status.message(); + EXPECT_TRUE(execution_context_.get_error().find(expected_error) != std::string::npos); // not valid should not return error execution_context_.Reset(); @@ -143,15 +140,12 @@ TEST_F(TestToDateHolder, TestSimpleDateTimeError) { } TEST_F(TestToDateHolder, TestSimpleDateTimeMakeError) { - std::shared_ptr to_date_holder; // reject time stamps for now. 
- auto status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS tzo", 0, &to_date_holder); - EXPECT_EQ(status.IsInvalid(), true) << status.message(); + ASSERT_RAISES(Invalid, ToDateHolder::Make("YYYY-MM-DD HH:MI:SS tzo", 0).status()); } TEST_F(TestToDateHolder, TestSimpleDateYearMonth) { - std::shared_ptr to_date_holder; - ASSERT_OK(ToDateHolder::Make("YYYY-MM", 1, &to_date_holder)); + EXPECT_OK_AND_ASSIGN(auto to_date_holder, ToDateHolder::Make("YYYY-MM", 1)); auto& to_date = *to_date_holder; bool out_valid; @@ -167,8 +161,7 @@ TEST_F(TestToDateHolder, TestSimpleDateYearMonth) { } TEST_F(TestToDateHolder, TestSimpleDateYear) { - std::shared_ptr to_date_holder; - ASSERT_OK(ToDateHolder::Make("YYYY", 1, &to_date_holder)); + EXPECT_OK_AND_ASSIGN(auto to_date_holder, ToDateHolder::Make("YYYY", 1)); auto& to_date = *to_date_holder; bool out_valid; @@ -178,4 +171,14 @@ TEST_F(TestToDateHolder, TestSimpleDateYear) { EXPECT_EQ(millis_since_epoch, 915148800000); } +TEST_F(TestToDateHolder, TestMakeFromFunctionNode) { + auto to_date_func = BuildToDate("YYYY"); + EXPECT_OK_AND_ASSIGN(auto to_date_holder, ToDateHolder::Make(to_date_func)); +} + +TEST_F(TestToDateHolder, TestMakeFromInvalidSurpressParamFunctionNode) { + auto non_literal_param = std::make_shared(arrow::field("in", arrow::utf8())); + auto to_date_func = BuildToDate("YYYY", std::move(non_literal_param)); + ASSERT_RAISES(Invalid, ToDateHolder::Make(to_date_func).status()); +} } // namespace gandiva diff --git a/cpp/src/generated/parquet_types.cpp b/cpp/src/generated/parquet_types.cpp index 86188581e0c42..8932c4a4f8d19 100644 --- a/cpp/src/generated/parquet_types.cpp +++ b/cpp/src/generated/parquet_types.cpp @@ -1,5 +1,5 @@ /** - * Autogenerated by Thrift Compiler (0.18.1) + * Autogenerated by Thrift Compiler (0.19.0) * * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING * @generated @@ -615,6 +615,197 @@ std::string to_string(const BoundaryOrder::type& val) { } +SizeStatistics::~SizeStatistics() noexcept { +} + + +void SizeStatistics::__set_unencoded_byte_array_data_bytes(const int64_t val) { + this->unencoded_byte_array_data_bytes = val; +__isset.unencoded_byte_array_data_bytes = true; +} + +void SizeStatistics::__set_repetition_level_histogram(const std::vector & val) { + this->repetition_level_histogram = val; +__isset.repetition_level_histogram = true; +} + +void SizeStatistics::__set_definition_level_histogram(const std::vector & val) { + this->definition_level_histogram = val; +__isset.definition_level_histogram = true; +} +std::ostream& operator<<(std::ostream& out, const SizeStatistics& obj) +{ + obj.printTo(out); + return out; +} + + +uint32_t SizeStatistics::read(::apache::thrift::protocol::TProtocol* iprot) { + + ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot); + uint32_t xfer = 0; + std::string fname; + ::apache::thrift::protocol::TType ftype; + int16_t fid; + + xfer += iprot->readStructBegin(fname); + + using ::apache::thrift::protocol::TProtocolException; + + + while (true) + { + xfer += iprot->readFieldBegin(fname, ftype, fid); + if (ftype == ::apache::thrift::protocol::T_STOP) { + break; + } + switch (fid) + { + case 1: + if (ftype == ::apache::thrift::protocol::T_I64) { + xfer += iprot->readI64(this->unencoded_byte_array_data_bytes); + this->__isset.unencoded_byte_array_data_bytes = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 2: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->repetition_level_histogram.clear(); + uint32_t _size0; + 
::apache::thrift::protocol::TType _etype3; + xfer += iprot->readListBegin(_etype3, _size0); + this->repetition_level_histogram.resize(_size0); + uint32_t _i4; + for (_i4 = 0; _i4 < _size0; ++_i4) + { + xfer += iprot->readI64(this->repetition_level_histogram[_i4]); + } + xfer += iprot->readListEnd(); + } + this->__isset.repetition_level_histogram = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 3: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->definition_level_histogram.clear(); + uint32_t _size5; + ::apache::thrift::protocol::TType _etype8; + xfer += iprot->readListBegin(_etype8, _size5); + this->definition_level_histogram.resize(_size5); + uint32_t _i9; + for (_i9 = 0; _i9 < _size5; ++_i9) + { + xfer += iprot->readI64(this->definition_level_histogram[_i9]); + } + xfer += iprot->readListEnd(); + } + this->__isset.definition_level_histogram = true; + } else { + xfer += iprot->skip(ftype); + } + break; + default: + xfer += iprot->skip(ftype); + break; + } + xfer += iprot->readFieldEnd(); + } + + xfer += iprot->readStructEnd(); + + return xfer; +} + +uint32_t SizeStatistics::write(::apache::thrift::protocol::TProtocol* oprot) const { + uint32_t xfer = 0; + ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot); + xfer += oprot->writeStructBegin("SizeStatistics"); + + if (this->__isset.unencoded_byte_array_data_bytes) { + xfer += oprot->writeFieldBegin("unencoded_byte_array_data_bytes", ::apache::thrift::protocol::T_I64, 1); + xfer += oprot->writeI64(this->unencoded_byte_array_data_bytes); + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.repetition_level_histogram) { + xfer += oprot->writeFieldBegin("repetition_level_histogram", ::apache::thrift::protocol::T_LIST, 2); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->repetition_level_histogram.size())); + std::vector ::const_iterator _iter10; + for (_iter10 = this->repetition_level_histogram.begin(); _iter10 != this->repetition_level_histogram.end(); ++_iter10) + { + xfer += oprot->writeI64((*_iter10)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.definition_level_histogram) { + xfer += oprot->writeFieldBegin("definition_level_histogram", ::apache::thrift::protocol::T_LIST, 3); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->definition_level_histogram.size())); + std::vector ::const_iterator _iter11; + for (_iter11 = this->definition_level_histogram.begin(); _iter11 != this->definition_level_histogram.end(); ++_iter11) + { + xfer += oprot->writeI64((*_iter11)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + xfer += oprot->writeFieldStop(); + xfer += oprot->writeStructEnd(); + return xfer; +} + +void swap(SizeStatistics &a, SizeStatistics &b) { + using ::std::swap; + swap(a.unencoded_byte_array_data_bytes, b.unencoded_byte_array_data_bytes); + swap(a.repetition_level_histogram, b.repetition_level_histogram); + swap(a.definition_level_histogram, b.definition_level_histogram); + swap(a.__isset, b.__isset); +} + +SizeStatistics::SizeStatistics(const SizeStatistics& other12) { + unencoded_byte_array_data_bytes = other12.unencoded_byte_array_data_bytes; + repetition_level_histogram = other12.repetition_level_histogram; + definition_level_histogram = other12.definition_level_histogram; + __isset = other12.__isset; +} +SizeStatistics::SizeStatistics(SizeStatistics&& other13) noexcept { + unencoded_byte_array_data_bytes = 
other13.unencoded_byte_array_data_bytes; + repetition_level_histogram = std::move(other13.repetition_level_histogram); + definition_level_histogram = std::move(other13.definition_level_histogram); + __isset = other13.__isset; +} +SizeStatistics& SizeStatistics::operator=(const SizeStatistics& other14) { + unencoded_byte_array_data_bytes = other14.unencoded_byte_array_data_bytes; + repetition_level_histogram = other14.repetition_level_histogram; + definition_level_histogram = other14.definition_level_histogram; + __isset = other14.__isset; + return *this; +} +SizeStatistics& SizeStatistics::operator=(SizeStatistics&& other15) noexcept { + unencoded_byte_array_data_bytes = other15.unencoded_byte_array_data_bytes; + repetition_level_histogram = std::move(other15.repetition_level_histogram); + definition_level_histogram = std::move(other15.definition_level_histogram); + __isset = other15.__isset; + return *this; +} +void SizeStatistics::printTo(std::ostream& out) const { + using ::apache::thrift::to_string; + out << "SizeStatistics("; + out << "unencoded_byte_array_data_bytes="; (__isset.unencoded_byte_array_data_bytes ? (out << to_string(unencoded_byte_array_data_bytes)) : (out << "")); + out << ", " << "repetition_level_histogram="; (__isset.repetition_level_histogram ? (out << to_string(repetition_level_histogram)) : (out << "")); + out << ", " << "definition_level_histogram="; (__isset.definition_level_histogram ? (out << to_string(definition_level_histogram)) : (out << "")); + out << ")"; +} + + Statistics::~Statistics() noexcept { } @@ -648,6 +839,16 @@ void Statistics::__set_min_value(const std::string& val) { this->min_value = val; __isset.min_value = true; } + +void Statistics::__set_is_max_value_exact(const bool val) { + this->is_max_value_exact = val; +__isset.is_max_value_exact = true; +} + +void Statistics::__set_is_min_value_exact(const bool val) { + this->is_min_value_exact = val; +__isset.is_min_value_exact = true; +} std::ostream& operator<<(std::ostream& out, const Statistics& obj) { obj.printTo(out); @@ -724,6 +925,22 @@ uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 7: + if (ftype == ::apache::thrift::protocol::T_BOOL) { + xfer += iprot->readBool(this->is_max_value_exact); + this->__isset.is_max_value_exact = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 8: + if (ftype == ::apache::thrift::protocol::T_BOOL) { + xfer += iprot->readBool(this->is_min_value_exact); + this->__isset.is_min_value_exact = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -771,6 +988,16 @@ uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeBinary(this->min_value); xfer += oprot->writeFieldEnd(); } + if (this->__isset.is_max_value_exact) { + xfer += oprot->writeFieldBegin("is_max_value_exact", ::apache::thrift::protocol::T_BOOL, 7); + xfer += oprot->writeBool(this->is_max_value_exact); + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.is_min_value_exact) { + xfer += oprot->writeFieldBegin("is_min_value_exact", ::apache::thrift::protocol::T_BOOL, 8); + xfer += oprot->writeBool(this->is_min_value_exact); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -784,45 +1011,55 @@ void swap(Statistics &a, Statistics &b) { swap(a.distinct_count, b.distinct_count); swap(a.max_value, b.max_value); swap(a.min_value, b.min_value); + 
swap(a.is_max_value_exact, b.is_max_value_exact); + swap(a.is_min_value_exact, b.is_min_value_exact); swap(a.__isset, b.__isset); } -Statistics::Statistics(const Statistics& other0) { - max = other0.max; - min = other0.min; - null_count = other0.null_count; - distinct_count = other0.distinct_count; - max_value = other0.max_value; - min_value = other0.min_value; - __isset = other0.__isset; -} -Statistics::Statistics(Statistics&& other1) noexcept { - max = std::move(other1.max); - min = std::move(other1.min); - null_count = other1.null_count; - distinct_count = other1.distinct_count; - max_value = std::move(other1.max_value); - min_value = std::move(other1.min_value); - __isset = other1.__isset; -} -Statistics& Statistics::operator=(const Statistics& other2) { - max = other2.max; - min = other2.min; - null_count = other2.null_count; - distinct_count = other2.distinct_count; - max_value = other2.max_value; - min_value = other2.min_value; - __isset = other2.__isset; +Statistics::Statistics(const Statistics& other16) { + max = other16.max; + min = other16.min; + null_count = other16.null_count; + distinct_count = other16.distinct_count; + max_value = other16.max_value; + min_value = other16.min_value; + is_max_value_exact = other16.is_max_value_exact; + is_min_value_exact = other16.is_min_value_exact; + __isset = other16.__isset; +} +Statistics::Statistics(Statistics&& other17) noexcept { + max = std::move(other17.max); + min = std::move(other17.min); + null_count = other17.null_count; + distinct_count = other17.distinct_count; + max_value = std::move(other17.max_value); + min_value = std::move(other17.min_value); + is_max_value_exact = other17.is_max_value_exact; + is_min_value_exact = other17.is_min_value_exact; + __isset = other17.__isset; +} +Statistics& Statistics::operator=(const Statistics& other18) { + max = other18.max; + min = other18.min; + null_count = other18.null_count; + distinct_count = other18.distinct_count; + max_value = other18.max_value; + min_value = other18.min_value; + is_max_value_exact = other18.is_max_value_exact; + is_min_value_exact = other18.is_min_value_exact; + __isset = other18.__isset; return *this; } -Statistics& Statistics::operator=(Statistics&& other3) noexcept { - max = std::move(other3.max); - min = std::move(other3.min); - null_count = other3.null_count; - distinct_count = other3.distinct_count; - max_value = std::move(other3.max_value); - min_value = std::move(other3.min_value); - __isset = other3.__isset; +Statistics& Statistics::operator=(Statistics&& other19) noexcept { + max = std::move(other19.max); + min = std::move(other19.min); + null_count = other19.null_count; + distinct_count = other19.distinct_count; + max_value = std::move(other19.max_value); + min_value = std::move(other19.min_value); + is_max_value_exact = other19.is_max_value_exact; + is_min_value_exact = other19.is_min_value_exact; + __isset = other19.__isset; return *this; } void Statistics::printTo(std::ostream& out) const { @@ -834,6 +1071,8 @@ void Statistics::printTo(std::ostream& out) const { out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "")); out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "")); out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "")); + out << ", " << "is_max_value_exact="; (__isset.is_max_value_exact ? (out << to_string(is_max_value_exact)) : (out << "")); + out << ", " << "is_min_value_exact="; (__isset.is_min_value_exact ? 
(out << to_string(is_min_value_exact)) : (out << "")); out << ")"; } @@ -892,18 +1131,18 @@ void swap(StringType &a, StringType &b) { (void) b; } -StringType::StringType(const StringType& other4) noexcept { - (void) other4; +StringType::StringType(const StringType& other20) noexcept { + (void) other20; } -StringType::StringType(StringType&& other5) noexcept { - (void) other5; +StringType::StringType(StringType&& other21) noexcept { + (void) other21; } -StringType& StringType::operator=(const StringType& other6) noexcept { - (void) other6; +StringType& StringType::operator=(const StringType& other22) noexcept { + (void) other22; return *this; } -StringType& StringType::operator=(StringType&& other7) noexcept { - (void) other7; +StringType& StringType::operator=(StringType&& other23) noexcept { + (void) other23; return *this; } void StringType::printTo(std::ostream& out) const { @@ -967,18 +1206,18 @@ void swap(UUIDType &a, UUIDType &b) { (void) b; } -UUIDType::UUIDType(const UUIDType& other8) noexcept { - (void) other8; +UUIDType::UUIDType(const UUIDType& other24) noexcept { + (void) other24; } -UUIDType::UUIDType(UUIDType&& other9) noexcept { - (void) other9; +UUIDType::UUIDType(UUIDType&& other25) noexcept { + (void) other25; } -UUIDType& UUIDType::operator=(const UUIDType& other10) noexcept { - (void) other10; +UUIDType& UUIDType::operator=(const UUIDType& other26) noexcept { + (void) other26; return *this; } -UUIDType& UUIDType::operator=(UUIDType&& other11) noexcept { - (void) other11; +UUIDType& UUIDType::operator=(UUIDType&& other27) noexcept { + (void) other27; return *this; } void UUIDType::printTo(std::ostream& out) const { @@ -1042,18 +1281,18 @@ void swap(MapType &a, MapType &b) { (void) b; } -MapType::MapType(const MapType& other12) noexcept { - (void) other12; +MapType::MapType(const MapType& other28) noexcept { + (void) other28; } -MapType::MapType(MapType&& other13) noexcept { - (void) other13; +MapType::MapType(MapType&& other29) noexcept { + (void) other29; } -MapType& MapType::operator=(const MapType& other14) noexcept { - (void) other14; +MapType& MapType::operator=(const MapType& other30) noexcept { + (void) other30; return *this; } -MapType& MapType::operator=(MapType&& other15) noexcept { - (void) other15; +MapType& MapType::operator=(MapType&& other31) noexcept { + (void) other31; return *this; } void MapType::printTo(std::ostream& out) const { @@ -1117,18 +1356,18 @@ void swap(ListType &a, ListType &b) { (void) b; } -ListType::ListType(const ListType& other16) noexcept { - (void) other16; +ListType::ListType(const ListType& other32) noexcept { + (void) other32; } -ListType::ListType(ListType&& other17) noexcept { - (void) other17; +ListType::ListType(ListType&& other33) noexcept { + (void) other33; } -ListType& ListType::operator=(const ListType& other18) noexcept { - (void) other18; +ListType& ListType::operator=(const ListType& other34) noexcept { + (void) other34; return *this; } -ListType& ListType::operator=(ListType&& other19) noexcept { - (void) other19; +ListType& ListType::operator=(ListType&& other35) noexcept { + (void) other35; return *this; } void ListType::printTo(std::ostream& out) const { @@ -1192,18 +1431,18 @@ void swap(EnumType &a, EnumType &b) { (void) b; } -EnumType::EnumType(const EnumType& other20) noexcept { - (void) other20; +EnumType::EnumType(const EnumType& other36) noexcept { + (void) other36; } -EnumType::EnumType(EnumType&& other21) noexcept { - (void) other21; +EnumType::EnumType(EnumType&& other37) noexcept { + (void) other37; } 
-EnumType& EnumType::operator=(const EnumType& other22) noexcept { - (void) other22; +EnumType& EnumType::operator=(const EnumType& other38) noexcept { + (void) other38; return *this; } -EnumType& EnumType::operator=(EnumType&& other23) noexcept { - (void) other23; +EnumType& EnumType::operator=(EnumType&& other39) noexcept { + (void) other39; return *this; } void EnumType::printTo(std::ostream& out) const { @@ -1267,18 +1506,18 @@ void swap(DateType &a, DateType &b) { (void) b; } -DateType::DateType(const DateType& other24) noexcept { - (void) other24; +DateType::DateType(const DateType& other40) noexcept { + (void) other40; } -DateType::DateType(DateType&& other25) noexcept { - (void) other25; +DateType::DateType(DateType&& other41) noexcept { + (void) other41; } -DateType& DateType::operator=(const DateType& other26) noexcept { - (void) other26; +DateType& DateType::operator=(const DateType& other42) noexcept { + (void) other42; return *this; } -DateType& DateType::operator=(DateType&& other27) noexcept { - (void) other27; +DateType& DateType::operator=(DateType&& other43) noexcept { + (void) other43; return *this; } void DateType::printTo(std::ostream& out) const { @@ -1342,18 +1581,18 @@ void swap(Float16Type &a, Float16Type &b) { (void) b; } -Float16Type::Float16Type(const Float16Type& other28) noexcept { - (void) other28; +Float16Type::Float16Type(const Float16Type& other44) noexcept { + (void) other44; } -Float16Type::Float16Type(Float16Type&& other29) noexcept { - (void) other29; +Float16Type::Float16Type(Float16Type&& other45) noexcept { + (void) other45; } -Float16Type& Float16Type::operator=(const Float16Type& other30) noexcept { - (void) other30; +Float16Type& Float16Type::operator=(const Float16Type& other46) noexcept { + (void) other46; return *this; } -Float16Type& Float16Type::operator=(Float16Type&& other31) noexcept { - (void) other31; +Float16Type& Float16Type::operator=(Float16Type&& other47) noexcept { + (void) other47; return *this; } void Float16Type::printTo(std::ostream& out) const { @@ -1417,18 +1656,18 @@ void swap(NullType &a, NullType &b) { (void) b; } -NullType::NullType(const NullType& other32) noexcept { - (void) other32; +NullType::NullType(const NullType& other48) noexcept { + (void) other48; } -NullType::NullType(NullType&& other33) noexcept { - (void) other33; +NullType::NullType(NullType&& other49) noexcept { + (void) other49; } -NullType& NullType::operator=(const NullType& other34) noexcept { - (void) other34; +NullType& NullType::operator=(const NullType& other50) noexcept { + (void) other50; return *this; } -NullType& NullType::operator=(NullType&& other35) noexcept { - (void) other35; +NullType& NullType::operator=(NullType&& other51) noexcept { + (void) other51; return *this; } void NullType::printTo(std::ostream& out) const { @@ -1535,22 +1774,22 @@ void swap(DecimalType &a, DecimalType &b) { swap(a.precision, b.precision); } -DecimalType::DecimalType(const DecimalType& other36) noexcept { - scale = other36.scale; - precision = other36.precision; +DecimalType::DecimalType(const DecimalType& other52) noexcept { + scale = other52.scale; + precision = other52.precision; } -DecimalType::DecimalType(DecimalType&& other37) noexcept { - scale = other37.scale; - precision = other37.precision; +DecimalType::DecimalType(DecimalType&& other53) noexcept { + scale = other53.scale; + precision = other53.precision; } -DecimalType& DecimalType::operator=(const DecimalType& other38) noexcept { - scale = other38.scale; - precision = other38.precision; 
+DecimalType& DecimalType::operator=(const DecimalType& other54) noexcept { + scale = other54.scale; + precision = other54.precision; return *this; } -DecimalType& DecimalType::operator=(DecimalType&& other39) noexcept { - scale = other39.scale; - precision = other39.precision; +DecimalType& DecimalType::operator=(DecimalType&& other55) noexcept { + scale = other55.scale; + precision = other55.precision; return *this; } void DecimalType::printTo(std::ostream& out) const { @@ -1616,18 +1855,18 @@ void swap(MilliSeconds &a, MilliSeconds &b) { (void) b; } -MilliSeconds::MilliSeconds(const MilliSeconds& other40) noexcept { - (void) other40; +MilliSeconds::MilliSeconds(const MilliSeconds& other56) noexcept { + (void) other56; } -MilliSeconds::MilliSeconds(MilliSeconds&& other41) noexcept { - (void) other41; +MilliSeconds::MilliSeconds(MilliSeconds&& other57) noexcept { + (void) other57; } -MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other42) noexcept { - (void) other42; +MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other58) noexcept { + (void) other58; return *this; } -MilliSeconds& MilliSeconds::operator=(MilliSeconds&& other43) noexcept { - (void) other43; +MilliSeconds& MilliSeconds::operator=(MilliSeconds&& other59) noexcept { + (void) other59; return *this; } void MilliSeconds::printTo(std::ostream& out) const { @@ -1691,18 +1930,18 @@ void swap(MicroSeconds &a, MicroSeconds &b) { (void) b; } -MicroSeconds::MicroSeconds(const MicroSeconds& other44) noexcept { - (void) other44; +MicroSeconds::MicroSeconds(const MicroSeconds& other60) noexcept { + (void) other60; } -MicroSeconds::MicroSeconds(MicroSeconds&& other45) noexcept { - (void) other45; +MicroSeconds::MicroSeconds(MicroSeconds&& other61) noexcept { + (void) other61; } -MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other46) noexcept { - (void) other46; +MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other62) noexcept { + (void) other62; return *this; } -MicroSeconds& MicroSeconds::operator=(MicroSeconds&& other47) noexcept { - (void) other47; +MicroSeconds& MicroSeconds::operator=(MicroSeconds&& other63) noexcept { + (void) other63; return *this; } void MicroSeconds::printTo(std::ostream& out) const { @@ -1766,18 +2005,18 @@ void swap(NanoSeconds &a, NanoSeconds &b) { (void) b; } -NanoSeconds::NanoSeconds(const NanoSeconds& other48) noexcept { - (void) other48; +NanoSeconds::NanoSeconds(const NanoSeconds& other64) noexcept { + (void) other64; } -NanoSeconds::NanoSeconds(NanoSeconds&& other49) noexcept { - (void) other49; +NanoSeconds::NanoSeconds(NanoSeconds&& other65) noexcept { + (void) other65; } -NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other50) noexcept { - (void) other50; +NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other66) noexcept { + (void) other66; return *this; } -NanoSeconds& NanoSeconds::operator=(NanoSeconds&& other51) noexcept { - (void) other51; +NanoSeconds& NanoSeconds::operator=(NanoSeconds&& other67) noexcept { + (void) other67; return *this; } void NanoSeconds::printTo(std::ostream& out) const { @@ -1902,30 +2141,30 @@ void swap(TimeUnit &a, TimeUnit &b) { swap(a.__isset, b.__isset); } -TimeUnit::TimeUnit(const TimeUnit& other52) noexcept { - MILLIS = other52.MILLIS; - MICROS = other52.MICROS; - NANOS = other52.NANOS; - __isset = other52.__isset; +TimeUnit::TimeUnit(const TimeUnit& other68) noexcept { + MILLIS = other68.MILLIS; + MICROS = other68.MICROS; + NANOS = other68.NANOS; + __isset = other68.__isset; } -TimeUnit::TimeUnit(TimeUnit&& 
other53) noexcept { - MILLIS = std::move(other53.MILLIS); - MICROS = std::move(other53.MICROS); - NANOS = std::move(other53.NANOS); - __isset = other53.__isset; +TimeUnit::TimeUnit(TimeUnit&& other69) noexcept { + MILLIS = std::move(other69.MILLIS); + MICROS = std::move(other69.MICROS); + NANOS = std::move(other69.NANOS); + __isset = other69.__isset; } -TimeUnit& TimeUnit::operator=(const TimeUnit& other54) noexcept { - MILLIS = other54.MILLIS; - MICROS = other54.MICROS; - NANOS = other54.NANOS; - __isset = other54.__isset; +TimeUnit& TimeUnit::operator=(const TimeUnit& other70) noexcept { + MILLIS = other70.MILLIS; + MICROS = other70.MICROS; + NANOS = other70.NANOS; + __isset = other70.__isset; return *this; } -TimeUnit& TimeUnit::operator=(TimeUnit&& other55) noexcept { - MILLIS = std::move(other55.MILLIS); - MICROS = std::move(other55.MICROS); - NANOS = std::move(other55.NANOS); - __isset = other55.__isset; +TimeUnit& TimeUnit::operator=(TimeUnit&& other71) noexcept { + MILLIS = std::move(other71.MILLIS); + MICROS = std::move(other71.MICROS); + NANOS = std::move(other71.NANOS); + __isset = other71.__isset; return *this; } void TimeUnit::printTo(std::ostream& out) const { @@ -2035,22 +2274,22 @@ void swap(TimestampType &a, TimestampType &b) { swap(a.unit, b.unit); } -TimestampType::TimestampType(const TimestampType& other56) noexcept { - isAdjustedToUTC = other56.isAdjustedToUTC; - unit = other56.unit; +TimestampType::TimestampType(const TimestampType& other72) noexcept { + isAdjustedToUTC = other72.isAdjustedToUTC; + unit = other72.unit; } -TimestampType::TimestampType(TimestampType&& other57) noexcept { - isAdjustedToUTC = other57.isAdjustedToUTC; - unit = std::move(other57.unit); +TimestampType::TimestampType(TimestampType&& other73) noexcept { + isAdjustedToUTC = other73.isAdjustedToUTC; + unit = std::move(other73.unit); } -TimestampType& TimestampType::operator=(const TimestampType& other58) noexcept { - isAdjustedToUTC = other58.isAdjustedToUTC; - unit = other58.unit; +TimestampType& TimestampType::operator=(const TimestampType& other74) noexcept { + isAdjustedToUTC = other74.isAdjustedToUTC; + unit = other74.unit; return *this; } -TimestampType& TimestampType::operator=(TimestampType&& other59) noexcept { - isAdjustedToUTC = other59.isAdjustedToUTC; - unit = std::move(other59.unit); +TimestampType& TimestampType::operator=(TimestampType&& other75) noexcept { + isAdjustedToUTC = other75.isAdjustedToUTC; + unit = std::move(other75.unit); return *this; } void TimestampType::printTo(std::ostream& out) const { @@ -2159,22 +2398,22 @@ void swap(TimeType &a, TimeType &b) { swap(a.unit, b.unit); } -TimeType::TimeType(const TimeType& other60) noexcept { - isAdjustedToUTC = other60.isAdjustedToUTC; - unit = other60.unit; +TimeType::TimeType(const TimeType& other76) noexcept { + isAdjustedToUTC = other76.isAdjustedToUTC; + unit = other76.unit; } -TimeType::TimeType(TimeType&& other61) noexcept { - isAdjustedToUTC = other61.isAdjustedToUTC; - unit = std::move(other61.unit); +TimeType::TimeType(TimeType&& other77) noexcept { + isAdjustedToUTC = other77.isAdjustedToUTC; + unit = std::move(other77.unit); } -TimeType& TimeType::operator=(const TimeType& other62) noexcept { - isAdjustedToUTC = other62.isAdjustedToUTC; - unit = other62.unit; +TimeType& TimeType::operator=(const TimeType& other78) noexcept { + isAdjustedToUTC = other78.isAdjustedToUTC; + unit = other78.unit; return *this; } -TimeType& TimeType::operator=(TimeType&& other63) noexcept { - isAdjustedToUTC = other63.isAdjustedToUTC; - 
unit = std::move(other63.unit); +TimeType& TimeType::operator=(TimeType&& other79) noexcept { + isAdjustedToUTC = other79.isAdjustedToUTC; + unit = std::move(other79.unit); return *this; } void TimeType::printTo(std::ostream& out) const { @@ -2283,22 +2522,22 @@ void swap(IntType &a, IntType &b) { swap(a.isSigned, b.isSigned); } -IntType::IntType(const IntType& other64) noexcept { - bitWidth = other64.bitWidth; - isSigned = other64.isSigned; +IntType::IntType(const IntType& other80) noexcept { + bitWidth = other80.bitWidth; + isSigned = other80.isSigned; } -IntType::IntType(IntType&& other65) noexcept { - bitWidth = other65.bitWidth; - isSigned = other65.isSigned; +IntType::IntType(IntType&& other81) noexcept { + bitWidth = other81.bitWidth; + isSigned = other81.isSigned; } -IntType& IntType::operator=(const IntType& other66) noexcept { - bitWidth = other66.bitWidth; - isSigned = other66.isSigned; +IntType& IntType::operator=(const IntType& other82) noexcept { + bitWidth = other82.bitWidth; + isSigned = other82.isSigned; return *this; } -IntType& IntType::operator=(IntType&& other67) noexcept { - bitWidth = other67.bitWidth; - isSigned = other67.isSigned; +IntType& IntType::operator=(IntType&& other83) noexcept { + bitWidth = other83.bitWidth; + isSigned = other83.isSigned; return *this; } void IntType::printTo(std::ostream& out) const { @@ -2364,18 +2603,18 @@ void swap(JsonType &a, JsonType &b) { (void) b; } -JsonType::JsonType(const JsonType& other68) noexcept { - (void) other68; +JsonType::JsonType(const JsonType& other84) noexcept { + (void) other84; } -JsonType::JsonType(JsonType&& other69) noexcept { - (void) other69; +JsonType::JsonType(JsonType&& other85) noexcept { + (void) other85; } -JsonType& JsonType::operator=(const JsonType& other70) noexcept { - (void) other70; +JsonType& JsonType::operator=(const JsonType& other86) noexcept { + (void) other86; return *this; } -JsonType& JsonType::operator=(JsonType&& other71) noexcept { - (void) other71; +JsonType& JsonType::operator=(JsonType&& other87) noexcept { + (void) other87; return *this; } void JsonType::printTo(std::ostream& out) const { @@ -2439,18 +2678,18 @@ void swap(BsonType &a, BsonType &b) { (void) b; } -BsonType::BsonType(const BsonType& other72) noexcept { - (void) other72; +BsonType::BsonType(const BsonType& other88) noexcept { + (void) other88; } -BsonType::BsonType(BsonType&& other73) noexcept { - (void) other73; +BsonType::BsonType(BsonType&& other89) noexcept { + (void) other89; } -BsonType& BsonType::operator=(const BsonType& other74) noexcept { - (void) other74; +BsonType& BsonType::operator=(const BsonType& other90) noexcept { + (void) other90; return *this; } -BsonType& BsonType::operator=(BsonType&& other75) noexcept { - (void) other75; +BsonType& BsonType::operator=(BsonType&& other91) noexcept { + (void) other91; return *this; } void BsonType::printTo(std::ostream& out) const { @@ -2784,74 +3023,74 @@ void swap(LogicalType &a, LogicalType &b) { swap(a.__isset, b.__isset); } -LogicalType::LogicalType(const LogicalType& other76) noexcept { - STRING = other76.STRING; - MAP = other76.MAP; - LIST = other76.LIST; - ENUM = other76.ENUM; - DECIMAL = other76.DECIMAL; - DATE = other76.DATE; - TIME = other76.TIME; - TIMESTAMP = other76.TIMESTAMP; - INTEGER = other76.INTEGER; - UNKNOWN = other76.UNKNOWN; - JSON = other76.JSON; - BSON = other76.BSON; - UUID = other76.UUID; - FLOAT16 = other76.FLOAT16; - __isset = other76.__isset; -} -LogicalType::LogicalType(LogicalType&& other77) noexcept { - STRING = 
std::move(other77.STRING); - MAP = std::move(other77.MAP); - LIST = std::move(other77.LIST); - ENUM = std::move(other77.ENUM); - DECIMAL = std::move(other77.DECIMAL); - DATE = std::move(other77.DATE); - TIME = std::move(other77.TIME); - TIMESTAMP = std::move(other77.TIMESTAMP); - INTEGER = std::move(other77.INTEGER); - UNKNOWN = std::move(other77.UNKNOWN); - JSON = std::move(other77.JSON); - BSON = std::move(other77.BSON); - UUID = std::move(other77.UUID); - FLOAT16 = std::move(other77.FLOAT16); - __isset = other77.__isset; -} -LogicalType& LogicalType::operator=(const LogicalType& other78) noexcept { - STRING = other78.STRING; - MAP = other78.MAP; - LIST = other78.LIST; - ENUM = other78.ENUM; - DECIMAL = other78.DECIMAL; - DATE = other78.DATE; - TIME = other78.TIME; - TIMESTAMP = other78.TIMESTAMP; - INTEGER = other78.INTEGER; - UNKNOWN = other78.UNKNOWN; - JSON = other78.JSON; - BSON = other78.BSON; - UUID = other78.UUID; - FLOAT16 = other78.FLOAT16; - __isset = other78.__isset; +LogicalType::LogicalType(const LogicalType& other92) noexcept { + STRING = other92.STRING; + MAP = other92.MAP; + LIST = other92.LIST; + ENUM = other92.ENUM; + DECIMAL = other92.DECIMAL; + DATE = other92.DATE; + TIME = other92.TIME; + TIMESTAMP = other92.TIMESTAMP; + INTEGER = other92.INTEGER; + UNKNOWN = other92.UNKNOWN; + JSON = other92.JSON; + BSON = other92.BSON; + UUID = other92.UUID; + FLOAT16 = other92.FLOAT16; + __isset = other92.__isset; +} +LogicalType::LogicalType(LogicalType&& other93) noexcept { + STRING = std::move(other93.STRING); + MAP = std::move(other93.MAP); + LIST = std::move(other93.LIST); + ENUM = std::move(other93.ENUM); + DECIMAL = std::move(other93.DECIMAL); + DATE = std::move(other93.DATE); + TIME = std::move(other93.TIME); + TIMESTAMP = std::move(other93.TIMESTAMP); + INTEGER = std::move(other93.INTEGER); + UNKNOWN = std::move(other93.UNKNOWN); + JSON = std::move(other93.JSON); + BSON = std::move(other93.BSON); + UUID = std::move(other93.UUID); + FLOAT16 = std::move(other93.FLOAT16); + __isset = other93.__isset; +} +LogicalType& LogicalType::operator=(const LogicalType& other94) noexcept { + STRING = other94.STRING; + MAP = other94.MAP; + LIST = other94.LIST; + ENUM = other94.ENUM; + DECIMAL = other94.DECIMAL; + DATE = other94.DATE; + TIME = other94.TIME; + TIMESTAMP = other94.TIMESTAMP; + INTEGER = other94.INTEGER; + UNKNOWN = other94.UNKNOWN; + JSON = other94.JSON; + BSON = other94.BSON; + UUID = other94.UUID; + FLOAT16 = other94.FLOAT16; + __isset = other94.__isset; return *this; } -LogicalType& LogicalType::operator=(LogicalType&& other79) noexcept { - STRING = std::move(other79.STRING); - MAP = std::move(other79.MAP); - LIST = std::move(other79.LIST); - ENUM = std::move(other79.ENUM); - DECIMAL = std::move(other79.DECIMAL); - DATE = std::move(other79.DATE); - TIME = std::move(other79.TIME); - TIMESTAMP = std::move(other79.TIMESTAMP); - INTEGER = std::move(other79.INTEGER); - UNKNOWN = std::move(other79.UNKNOWN); - JSON = std::move(other79.JSON); - BSON = std::move(other79.BSON); - UUID = std::move(other79.UUID); - FLOAT16 = std::move(other79.FLOAT16); - __isset = other79.__isset; +LogicalType& LogicalType::operator=(LogicalType&& other95) noexcept { + STRING = std::move(other95.STRING); + MAP = std::move(other95.MAP); + LIST = std::move(other95.LIST); + ENUM = std::move(other95.ENUM); + DECIMAL = std::move(other95.DECIMAL); + DATE = std::move(other95.DATE); + TIME = std::move(other95.TIME); + TIMESTAMP = std::move(other95.TIMESTAMP); + INTEGER = std::move(other95.INTEGER); + 
UNKNOWN = std::move(other95.UNKNOWN); + JSON = std::move(other95.JSON); + BSON = std::move(other95.BSON); + UUID = std::move(other95.UUID); + FLOAT16 = std::move(other95.FLOAT16); + __isset = other95.__isset; return *this; } void LogicalType::printTo(std::ostream& out) const { @@ -2958,9 +3197,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast80; - xfer += iprot->readI32(ecast80); - this->type = static_cast(ecast80); + int32_t ecast96; + xfer += iprot->readI32(ecast96); + this->type = static_cast(ecast96); this->__isset.type = true; } else { xfer += iprot->skip(ftype); @@ -2976,9 +3215,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 3: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast81; - xfer += iprot->readI32(ecast81); - this->repetition_type = static_cast(ecast81); + int32_t ecast97; + xfer += iprot->readI32(ecast97); + this->repetition_type = static_cast(ecast97); this->__isset.repetition_type = true; } else { xfer += iprot->skip(ftype); @@ -3002,9 +3241,9 @@ uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 6: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast82; - xfer += iprot->readI32(ecast82); - this->converted_type = static_cast(ecast82); + int32_t ecast98; + xfer += iprot->readI32(ecast98); + this->converted_type = static_cast(ecast98); this->__isset.converted_type = true; } else { xfer += iprot->skip(ftype); @@ -3130,58 +3369,58 @@ void swap(SchemaElement &a, SchemaElement &b) { swap(a.__isset, b.__isset); } -SchemaElement::SchemaElement(const SchemaElement& other83) { - type = other83.type; - type_length = other83.type_length; - repetition_type = other83.repetition_type; - name = other83.name; - num_children = other83.num_children; - converted_type = other83.converted_type; - scale = other83.scale; - precision = other83.precision; - field_id = other83.field_id; - logicalType = other83.logicalType; - __isset = other83.__isset; -} -SchemaElement::SchemaElement(SchemaElement&& other84) noexcept { - type = other84.type; - type_length = other84.type_length; - repetition_type = other84.repetition_type; - name = std::move(other84.name); - num_children = other84.num_children; - converted_type = other84.converted_type; - scale = other84.scale; - precision = other84.precision; - field_id = other84.field_id; - logicalType = std::move(other84.logicalType); - __isset = other84.__isset; -} -SchemaElement& SchemaElement::operator=(const SchemaElement& other85) { - type = other85.type; - type_length = other85.type_length; - repetition_type = other85.repetition_type; - name = other85.name; - num_children = other85.num_children; - converted_type = other85.converted_type; - scale = other85.scale; - precision = other85.precision; - field_id = other85.field_id; - logicalType = other85.logicalType; - __isset = other85.__isset; +SchemaElement::SchemaElement(const SchemaElement& other99) { + type = other99.type; + type_length = other99.type_length; + repetition_type = other99.repetition_type; + name = other99.name; + num_children = other99.num_children; + converted_type = other99.converted_type; + scale = other99.scale; + precision = other99.precision; + field_id = other99.field_id; + logicalType = other99.logicalType; + __isset = other99.__isset; +} +SchemaElement::SchemaElement(SchemaElement&& other100) noexcept { + type = other100.type; + type_length = other100.type_length; + 
repetition_type = other100.repetition_type; + name = std::move(other100.name); + num_children = other100.num_children; + converted_type = other100.converted_type; + scale = other100.scale; + precision = other100.precision; + field_id = other100.field_id; + logicalType = std::move(other100.logicalType); + __isset = other100.__isset; +} +SchemaElement& SchemaElement::operator=(const SchemaElement& other101) { + type = other101.type; + type_length = other101.type_length; + repetition_type = other101.repetition_type; + name = other101.name; + num_children = other101.num_children; + converted_type = other101.converted_type; + scale = other101.scale; + precision = other101.precision; + field_id = other101.field_id; + logicalType = other101.logicalType; + __isset = other101.__isset; return *this; } -SchemaElement& SchemaElement::operator=(SchemaElement&& other86) noexcept { - type = other86.type; - type_length = other86.type_length; - repetition_type = other86.repetition_type; - name = std::move(other86.name); - num_children = other86.num_children; - converted_type = other86.converted_type; - scale = other86.scale; - precision = other86.precision; - field_id = other86.field_id; - logicalType = std::move(other86.logicalType); - __isset = other86.__isset; +SchemaElement& SchemaElement::operator=(SchemaElement&& other102) noexcept { + type = other102.type; + type_length = other102.type_length; + repetition_type = other102.repetition_type; + name = std::move(other102.name); + num_children = other102.num_children; + converted_type = other102.converted_type; + scale = other102.scale; + precision = other102.precision; + field_id = other102.field_id; + logicalType = std::move(other102.logicalType); + __isset = other102.__isset; return *this; } void SchemaElement::printTo(std::ostream& out) const { @@ -3267,9 +3506,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast87; - xfer += iprot->readI32(ecast87); - this->encoding = static_cast(ecast87); + int32_t ecast103; + xfer += iprot->readI32(ecast103); + this->encoding = static_cast(ecast103); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3277,9 +3516,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 3: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast88; - xfer += iprot->readI32(ecast88); - this->definition_level_encoding = static_cast(ecast88); + int32_t ecast104; + xfer += iprot->readI32(ecast104); + this->definition_level_encoding = static_cast(ecast104); isset_definition_level_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3287,9 +3526,9 @@ uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast89; - xfer += iprot->readI32(ecast89); - this->repetition_level_encoding = static_cast(ecast89); + int32_t ecast105; + xfer += iprot->readI32(ecast105); + this->repetition_level_encoding = static_cast(ecast105); isset_repetition_level_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3364,38 +3603,38 @@ void swap(DataPageHeader &a, DataPageHeader &b) { swap(a.__isset, b.__isset); } -DataPageHeader::DataPageHeader(const DataPageHeader& other90) { - num_values = other90.num_values; - encoding = other90.encoding; - definition_level_encoding = other90.definition_level_encoding; - repetition_level_encoding = other90.repetition_level_encoding; - statistics = 
other90.statistics; - __isset = other90.__isset; -} -DataPageHeader::DataPageHeader(DataPageHeader&& other91) noexcept { - num_values = other91.num_values; - encoding = other91.encoding; - definition_level_encoding = other91.definition_level_encoding; - repetition_level_encoding = other91.repetition_level_encoding; - statistics = std::move(other91.statistics); - __isset = other91.__isset; -} -DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other92) { - num_values = other92.num_values; - encoding = other92.encoding; - definition_level_encoding = other92.definition_level_encoding; - repetition_level_encoding = other92.repetition_level_encoding; - statistics = other92.statistics; - __isset = other92.__isset; +DataPageHeader::DataPageHeader(const DataPageHeader& other106) { + num_values = other106.num_values; + encoding = other106.encoding; + definition_level_encoding = other106.definition_level_encoding; + repetition_level_encoding = other106.repetition_level_encoding; + statistics = other106.statistics; + __isset = other106.__isset; +} +DataPageHeader::DataPageHeader(DataPageHeader&& other107) noexcept { + num_values = other107.num_values; + encoding = other107.encoding; + definition_level_encoding = other107.definition_level_encoding; + repetition_level_encoding = other107.repetition_level_encoding; + statistics = std::move(other107.statistics); + __isset = other107.__isset; +} +DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other108) { + num_values = other108.num_values; + encoding = other108.encoding; + definition_level_encoding = other108.definition_level_encoding; + repetition_level_encoding = other108.repetition_level_encoding; + statistics = other108.statistics; + __isset = other108.__isset; return *this; } -DataPageHeader& DataPageHeader::operator=(DataPageHeader&& other93) noexcept { - num_values = other93.num_values; - encoding = other93.encoding; - definition_level_encoding = other93.definition_level_encoding; - repetition_level_encoding = other93.repetition_level_encoding; - statistics = std::move(other93.statistics); - __isset = other93.__isset; +DataPageHeader& DataPageHeader::operator=(DataPageHeader&& other109) noexcept { + num_values = other109.num_values; + encoding = other109.encoding; + definition_level_encoding = other109.definition_level_encoding; + repetition_level_encoding = other109.repetition_level_encoding; + statistics = std::move(other109.statistics); + __isset = other109.__isset; return *this; } void DataPageHeader::printTo(std::ostream& out) const { @@ -3464,18 +3703,18 @@ void swap(IndexPageHeader &a, IndexPageHeader &b) { (void) b; } -IndexPageHeader::IndexPageHeader(const IndexPageHeader& other94) noexcept { - (void) other94; +IndexPageHeader::IndexPageHeader(const IndexPageHeader& other110) noexcept { + (void) other110; } -IndexPageHeader::IndexPageHeader(IndexPageHeader&& other95) noexcept { - (void) other95; +IndexPageHeader::IndexPageHeader(IndexPageHeader&& other111) noexcept { + (void) other111; } -IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other96) noexcept { - (void) other96; +IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other112) noexcept { + (void) other112; return *this; } -IndexPageHeader& IndexPageHeader::operator=(IndexPageHeader&& other97) noexcept { - (void) other97; +IndexPageHeader& IndexPageHeader::operator=(IndexPageHeader&& other113) noexcept { + (void) other113; return *this; } void IndexPageHeader::printTo(std::ostream& out) const { @@ -3541,9 +3780,9 @@ 
uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast98; - xfer += iprot->readI32(ecast98); - this->encoding = static_cast(ecast98); + int32_t ecast114; + xfer += iprot->readI32(ecast114); + this->encoding = static_cast(ecast114); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3604,30 +3843,30 @@ void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) { swap(a.__isset, b.__isset); } -DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other99) noexcept { - num_values = other99.num_values; - encoding = other99.encoding; - is_sorted = other99.is_sorted; - __isset = other99.__isset; -} -DictionaryPageHeader::DictionaryPageHeader(DictionaryPageHeader&& other100) noexcept { - num_values = other100.num_values; - encoding = other100.encoding; - is_sorted = other100.is_sorted; - __isset = other100.__isset; +DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other115) noexcept { + num_values = other115.num_values; + encoding = other115.encoding; + is_sorted = other115.is_sorted; + __isset = other115.__isset; } -DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other101) noexcept { - num_values = other101.num_values; - encoding = other101.encoding; - is_sorted = other101.is_sorted; - __isset = other101.__isset; +DictionaryPageHeader::DictionaryPageHeader(DictionaryPageHeader&& other116) noexcept { + num_values = other116.num_values; + encoding = other116.encoding; + is_sorted = other116.is_sorted; + __isset = other116.__isset; +} +DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other117) noexcept { + num_values = other117.num_values; + encoding = other117.encoding; + is_sorted = other117.is_sorted; + __isset = other117.__isset; return *this; } -DictionaryPageHeader& DictionaryPageHeader::operator=(DictionaryPageHeader&& other102) noexcept { - num_values = other102.num_values; - encoding = other102.encoding; - is_sorted = other102.is_sorted; - __isset = other102.__isset; +DictionaryPageHeader& DictionaryPageHeader::operator=(DictionaryPageHeader&& other118) noexcept { + num_values = other118.num_values; + encoding = other118.encoding; + is_sorted = other118.is_sorted; + __isset = other118.__isset; return *this; } void DictionaryPageHeader::printTo(std::ostream& out) const { @@ -3737,9 +3976,9 @@ uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast103; - xfer += iprot->readI32(ecast103); - this->encoding = static_cast(ecast103); + int32_t ecast119; + xfer += iprot->readI32(ecast119); + this->encoding = static_cast(ecast119); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -3858,50 +4097,50 @@ void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) { swap(a.__isset, b.__isset); } -DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other104) { - num_values = other104.num_values; - num_nulls = other104.num_nulls; - num_rows = other104.num_rows; - encoding = other104.encoding; - definition_levels_byte_length = other104.definition_levels_byte_length; - repetition_levels_byte_length = other104.repetition_levels_byte_length; - is_compressed = other104.is_compressed; - statistics = other104.statistics; - __isset = other104.__isset; -} -DataPageHeaderV2::DataPageHeaderV2(DataPageHeaderV2&& other105) noexcept { - num_values = other105.num_values; - 
num_nulls = other105.num_nulls; - num_rows = other105.num_rows; - encoding = other105.encoding; - definition_levels_byte_length = other105.definition_levels_byte_length; - repetition_levels_byte_length = other105.repetition_levels_byte_length; - is_compressed = other105.is_compressed; - statistics = std::move(other105.statistics); - __isset = other105.__isset; -} -DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other106) { - num_values = other106.num_values; - num_nulls = other106.num_nulls; - num_rows = other106.num_rows; - encoding = other106.encoding; - definition_levels_byte_length = other106.definition_levels_byte_length; - repetition_levels_byte_length = other106.repetition_levels_byte_length; - is_compressed = other106.is_compressed; - statistics = other106.statistics; - __isset = other106.__isset; +DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other120) { + num_values = other120.num_values; + num_nulls = other120.num_nulls; + num_rows = other120.num_rows; + encoding = other120.encoding; + definition_levels_byte_length = other120.definition_levels_byte_length; + repetition_levels_byte_length = other120.repetition_levels_byte_length; + is_compressed = other120.is_compressed; + statistics = other120.statistics; + __isset = other120.__isset; +} +DataPageHeaderV2::DataPageHeaderV2(DataPageHeaderV2&& other121) noexcept { + num_values = other121.num_values; + num_nulls = other121.num_nulls; + num_rows = other121.num_rows; + encoding = other121.encoding; + definition_levels_byte_length = other121.definition_levels_byte_length; + repetition_levels_byte_length = other121.repetition_levels_byte_length; + is_compressed = other121.is_compressed; + statistics = std::move(other121.statistics); + __isset = other121.__isset; +} +DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other122) { + num_values = other122.num_values; + num_nulls = other122.num_nulls; + num_rows = other122.num_rows; + encoding = other122.encoding; + definition_levels_byte_length = other122.definition_levels_byte_length; + repetition_levels_byte_length = other122.repetition_levels_byte_length; + is_compressed = other122.is_compressed; + statistics = other122.statistics; + __isset = other122.__isset; return *this; } -DataPageHeaderV2& DataPageHeaderV2::operator=(DataPageHeaderV2&& other107) noexcept { - num_values = other107.num_values; - num_nulls = other107.num_nulls; - num_rows = other107.num_rows; - encoding = other107.encoding; - definition_levels_byte_length = other107.definition_levels_byte_length; - repetition_levels_byte_length = other107.repetition_levels_byte_length; - is_compressed = other107.is_compressed; - statistics = std::move(other107.statistics); - __isset = other107.__isset; +DataPageHeaderV2& DataPageHeaderV2::operator=(DataPageHeaderV2&& other123) noexcept { + num_values = other123.num_values; + num_nulls = other123.num_nulls; + num_rows = other123.num_rows; + encoding = other123.encoding; + definition_levels_byte_length = other123.definition_levels_byte_length; + repetition_levels_byte_length = other123.repetition_levels_byte_length; + is_compressed = other123.is_compressed; + statistics = std::move(other123.statistics); + __isset = other123.__isset; return *this; } void DataPageHeaderV2::printTo(std::ostream& out) const { @@ -3973,18 +4212,18 @@ void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) { (void) b; } -SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other108) noexcept { - (void) other108; 
+SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other124) noexcept { + (void) other124; } -SplitBlockAlgorithm::SplitBlockAlgorithm(SplitBlockAlgorithm&& other109) noexcept { - (void) other109; +SplitBlockAlgorithm::SplitBlockAlgorithm(SplitBlockAlgorithm&& other125) noexcept { + (void) other125; } -SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other110) noexcept { - (void) other110; +SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other126) noexcept { + (void) other126; return *this; } -SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(SplitBlockAlgorithm&& other111) noexcept { - (void) other111; +SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(SplitBlockAlgorithm&& other127) noexcept { + (void) other127; return *this; } void SplitBlockAlgorithm::printTo(std::ostream& out) const { @@ -4071,22 +4310,22 @@ void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) { swap(a.__isset, b.__isset); } -BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other112) noexcept { - BLOCK = other112.BLOCK; - __isset = other112.__isset; +BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other128) noexcept { + BLOCK = other128.BLOCK; + __isset = other128.__isset; } -BloomFilterAlgorithm::BloomFilterAlgorithm(BloomFilterAlgorithm&& other113) noexcept { - BLOCK = std::move(other113.BLOCK); - __isset = other113.__isset; +BloomFilterAlgorithm::BloomFilterAlgorithm(BloomFilterAlgorithm&& other129) noexcept { + BLOCK = std::move(other129.BLOCK); + __isset = other129.__isset; } -BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other114) noexcept { - BLOCK = other114.BLOCK; - __isset = other114.__isset; +BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other130) noexcept { + BLOCK = other130.BLOCK; + __isset = other130.__isset; return *this; } -BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(BloomFilterAlgorithm&& other115) noexcept { - BLOCK = std::move(other115.BLOCK); - __isset = other115.__isset; +BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(BloomFilterAlgorithm&& other131) noexcept { + BLOCK = std::move(other131.BLOCK); + __isset = other131.__isset; return *this; } void BloomFilterAlgorithm::printTo(std::ostream& out) const { @@ -4151,18 +4390,18 @@ void swap(XxHash &a, XxHash &b) { (void) b; } -XxHash::XxHash(const XxHash& other116) noexcept { - (void) other116; +XxHash::XxHash(const XxHash& other132) noexcept { + (void) other132; } -XxHash::XxHash(XxHash&& other117) noexcept { - (void) other117; +XxHash::XxHash(XxHash&& other133) noexcept { + (void) other133; } -XxHash& XxHash::operator=(const XxHash& other118) noexcept { - (void) other118; +XxHash& XxHash::operator=(const XxHash& other134) noexcept { + (void) other134; return *this; } -XxHash& XxHash::operator=(XxHash&& other119) noexcept { - (void) other119; +XxHash& XxHash::operator=(XxHash&& other135) noexcept { + (void) other135; return *this; } void XxHash::printTo(std::ostream& out) const { @@ -4249,22 +4488,22 @@ void swap(BloomFilterHash &a, BloomFilterHash &b) { swap(a.__isset, b.__isset); } -BloomFilterHash::BloomFilterHash(const BloomFilterHash& other120) noexcept { - XXHASH = other120.XXHASH; - __isset = other120.__isset; +BloomFilterHash::BloomFilterHash(const BloomFilterHash& other136) noexcept { + XXHASH = other136.XXHASH; + __isset = other136.__isset; } -BloomFilterHash::BloomFilterHash(BloomFilterHash&& other121) noexcept { - 
XXHASH = std::move(other121.XXHASH); - __isset = other121.__isset; +BloomFilterHash::BloomFilterHash(BloomFilterHash&& other137) noexcept { + XXHASH = std::move(other137.XXHASH); + __isset = other137.__isset; } -BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other122) noexcept { - XXHASH = other122.XXHASH; - __isset = other122.__isset; +BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other138) noexcept { + XXHASH = other138.XXHASH; + __isset = other138.__isset; return *this; } -BloomFilterHash& BloomFilterHash::operator=(BloomFilterHash&& other123) noexcept { - XXHASH = std::move(other123.XXHASH); - __isset = other123.__isset; +BloomFilterHash& BloomFilterHash::operator=(BloomFilterHash&& other139) noexcept { + XXHASH = std::move(other139.XXHASH); + __isset = other139.__isset; return *this; } void BloomFilterHash::printTo(std::ostream& out) const { @@ -4329,18 +4568,18 @@ void swap(Uncompressed &a, Uncompressed &b) { (void) b; } -Uncompressed::Uncompressed(const Uncompressed& other124) noexcept { - (void) other124; +Uncompressed::Uncompressed(const Uncompressed& other140) noexcept { + (void) other140; } -Uncompressed::Uncompressed(Uncompressed&& other125) noexcept { - (void) other125; +Uncompressed::Uncompressed(Uncompressed&& other141) noexcept { + (void) other141; } -Uncompressed& Uncompressed::operator=(const Uncompressed& other126) noexcept { - (void) other126; +Uncompressed& Uncompressed::operator=(const Uncompressed& other142) noexcept { + (void) other142; return *this; } -Uncompressed& Uncompressed::operator=(Uncompressed&& other127) noexcept { - (void) other127; +Uncompressed& Uncompressed::operator=(Uncompressed&& other143) noexcept { + (void) other143; return *this; } void Uncompressed::printTo(std::ostream& out) const { @@ -4427,22 +4666,22 @@ void swap(BloomFilterCompression &a, BloomFilterCompression &b) { swap(a.__isset, b.__isset); } -BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other128) noexcept { - UNCOMPRESSED = other128.UNCOMPRESSED; - __isset = other128.__isset; +BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other144) noexcept { + UNCOMPRESSED = other144.UNCOMPRESSED; + __isset = other144.__isset; } -BloomFilterCompression::BloomFilterCompression(BloomFilterCompression&& other129) noexcept { - UNCOMPRESSED = std::move(other129.UNCOMPRESSED); - __isset = other129.__isset; +BloomFilterCompression::BloomFilterCompression(BloomFilterCompression&& other145) noexcept { + UNCOMPRESSED = std::move(other145.UNCOMPRESSED); + __isset = other145.__isset; } -BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other130) noexcept { - UNCOMPRESSED = other130.UNCOMPRESSED; - __isset = other130.__isset; +BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other146) noexcept { + UNCOMPRESSED = other146.UNCOMPRESSED; + __isset = other146.__isset; return *this; } -BloomFilterCompression& BloomFilterCompression::operator=(BloomFilterCompression&& other131) noexcept { - UNCOMPRESSED = std::move(other131.UNCOMPRESSED); - __isset = other131.__isset; +BloomFilterCompression& BloomFilterCompression::operator=(BloomFilterCompression&& other147) noexcept { + UNCOMPRESSED = std::move(other147.UNCOMPRESSED); + __isset = other147.__isset; return *this; } void BloomFilterCompression::printTo(std::ostream& out) const { @@ -4590,30 +4829,30 @@ void swap(BloomFilterHeader &a, BloomFilterHeader &b) { swap(a.compression, 
b.compression); } -BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other132) noexcept { - numBytes = other132.numBytes; - algorithm = other132.algorithm; - hash = other132.hash; - compression = other132.compression; +BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other148) noexcept { + numBytes = other148.numBytes; + algorithm = other148.algorithm; + hash = other148.hash; + compression = other148.compression; } -BloomFilterHeader::BloomFilterHeader(BloomFilterHeader&& other133) noexcept { - numBytes = other133.numBytes; - algorithm = std::move(other133.algorithm); - hash = std::move(other133.hash); - compression = std::move(other133.compression); +BloomFilterHeader::BloomFilterHeader(BloomFilterHeader&& other149) noexcept { + numBytes = other149.numBytes; + algorithm = std::move(other149.algorithm); + hash = std::move(other149.hash); + compression = std::move(other149.compression); } -BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other134) noexcept { - numBytes = other134.numBytes; - algorithm = other134.algorithm; - hash = other134.hash; - compression = other134.compression; +BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other150) noexcept { + numBytes = other150.numBytes; + algorithm = other150.algorithm; + hash = other150.hash; + compression = other150.compression; return *this; } -BloomFilterHeader& BloomFilterHeader::operator=(BloomFilterHeader&& other135) noexcept { - numBytes = other135.numBytes; - algorithm = std::move(other135.algorithm); - hash = std::move(other135.hash); - compression = std::move(other135.compression); +BloomFilterHeader& BloomFilterHeader::operator=(BloomFilterHeader&& other151) noexcept { + numBytes = other151.numBytes; + algorithm = std::move(other151.algorithm); + hash = std::move(other151.hash); + compression = std::move(other151.compression); return *this; } void BloomFilterHeader::printTo(std::ostream& out) const { @@ -4700,9 +4939,9 @@ uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast136; - xfer += iprot->readI32(ecast136); - this->type = static_cast(ecast136); + int32_t ecast152; + xfer += iprot->readI32(ecast152); + this->type = static_cast(ecast152); isset_type = true; } else { xfer += iprot->skip(ftype); @@ -4842,50 +5081,50 @@ void swap(PageHeader &a, PageHeader &b) { swap(a.__isset, b.__isset); } -PageHeader::PageHeader(const PageHeader& other137) { - type = other137.type; - uncompressed_page_size = other137.uncompressed_page_size; - compressed_page_size = other137.compressed_page_size; - crc = other137.crc; - data_page_header = other137.data_page_header; - index_page_header = other137.index_page_header; - dictionary_page_header = other137.dictionary_page_header; - data_page_header_v2 = other137.data_page_header_v2; - __isset = other137.__isset; -} -PageHeader::PageHeader(PageHeader&& other138) noexcept { - type = other138.type; - uncompressed_page_size = other138.uncompressed_page_size; - compressed_page_size = other138.compressed_page_size; - crc = other138.crc; - data_page_header = std::move(other138.data_page_header); - index_page_header = std::move(other138.index_page_header); - dictionary_page_header = std::move(other138.dictionary_page_header); - data_page_header_v2 = std::move(other138.data_page_header_v2); - __isset = other138.__isset; -} -PageHeader& PageHeader::operator=(const PageHeader& other139) { - type = other139.type; - uncompressed_page_size = 
other139.uncompressed_page_size; - compressed_page_size = other139.compressed_page_size; - crc = other139.crc; - data_page_header = other139.data_page_header; - index_page_header = other139.index_page_header; - dictionary_page_header = other139.dictionary_page_header; - data_page_header_v2 = other139.data_page_header_v2; - __isset = other139.__isset; +PageHeader::PageHeader(const PageHeader& other153) { + type = other153.type; + uncompressed_page_size = other153.uncompressed_page_size; + compressed_page_size = other153.compressed_page_size; + crc = other153.crc; + data_page_header = other153.data_page_header; + index_page_header = other153.index_page_header; + dictionary_page_header = other153.dictionary_page_header; + data_page_header_v2 = other153.data_page_header_v2; + __isset = other153.__isset; +} +PageHeader::PageHeader(PageHeader&& other154) noexcept { + type = other154.type; + uncompressed_page_size = other154.uncompressed_page_size; + compressed_page_size = other154.compressed_page_size; + crc = other154.crc; + data_page_header = std::move(other154.data_page_header); + index_page_header = std::move(other154.index_page_header); + dictionary_page_header = std::move(other154.dictionary_page_header); + data_page_header_v2 = std::move(other154.data_page_header_v2); + __isset = other154.__isset; +} +PageHeader& PageHeader::operator=(const PageHeader& other155) { + type = other155.type; + uncompressed_page_size = other155.uncompressed_page_size; + compressed_page_size = other155.compressed_page_size; + crc = other155.crc; + data_page_header = other155.data_page_header; + index_page_header = other155.index_page_header; + dictionary_page_header = other155.dictionary_page_header; + data_page_header_v2 = other155.data_page_header_v2; + __isset = other155.__isset; return *this; } -PageHeader& PageHeader::operator=(PageHeader&& other140) noexcept { - type = other140.type; - uncompressed_page_size = other140.uncompressed_page_size; - compressed_page_size = other140.compressed_page_size; - crc = other140.crc; - data_page_header = std::move(other140.data_page_header); - index_page_header = std::move(other140.index_page_header); - dictionary_page_header = std::move(other140.dictionary_page_header); - data_page_header_v2 = std::move(other140.data_page_header_v2); - __isset = other140.__isset; +PageHeader& PageHeader::operator=(PageHeader&& other156) noexcept { + type = other156.type; + uncompressed_page_size = other156.uncompressed_page_size; + compressed_page_size = other156.compressed_page_size; + crc = other156.crc; + data_page_header = std::move(other156.data_page_header); + index_page_header = std::move(other156.index_page_header); + dictionary_page_header = std::move(other156.dictionary_page_header); + data_page_header_v2 = std::move(other156.data_page_header_v2); + __isset = other156.__isset; return *this; } void PageHeader::printTo(std::ostream& out) const { @@ -5000,26 +5239,26 @@ void swap(KeyValue &a, KeyValue &b) { swap(a.__isset, b.__isset); } -KeyValue::KeyValue(const KeyValue& other141) { - key = other141.key; - value = other141.value; - __isset = other141.__isset; +KeyValue::KeyValue(const KeyValue& other157) { + key = other157.key; + value = other157.value; + __isset = other157.__isset; } -KeyValue::KeyValue(KeyValue&& other142) noexcept { - key = std::move(other142.key); - value = std::move(other142.value); - __isset = other142.__isset; +KeyValue::KeyValue(KeyValue&& other158) noexcept { + key = std::move(other158.key); + value = std::move(other158.value); + __isset = 
other158.__isset; } -KeyValue& KeyValue::operator=(const KeyValue& other143) { - key = other143.key; - value = other143.value; - __isset = other143.__isset; +KeyValue& KeyValue::operator=(const KeyValue& other159) { + key = other159.key; + value = other159.value; + __isset = other159.__isset; return *this; } -KeyValue& KeyValue::operator=(KeyValue&& other144) noexcept { - key = std::move(other144.key); - value = std::move(other144.value); - __isset = other144.__isset; +KeyValue& KeyValue::operator=(KeyValue&& other160) noexcept { + key = std::move(other160.key); + value = std::move(other160.value); + __isset = other160.__isset; return *this; } void KeyValue::printTo(std::ostream& out) const { @@ -5148,26 +5387,26 @@ void swap(SortingColumn &a, SortingColumn &b) { swap(a.nulls_first, b.nulls_first); } -SortingColumn::SortingColumn(const SortingColumn& other145) noexcept { - column_idx = other145.column_idx; - descending = other145.descending; - nulls_first = other145.nulls_first; +SortingColumn::SortingColumn(const SortingColumn& other161) noexcept { + column_idx = other161.column_idx; + descending = other161.descending; + nulls_first = other161.nulls_first; } -SortingColumn::SortingColumn(SortingColumn&& other146) noexcept { - column_idx = other146.column_idx; - descending = other146.descending; - nulls_first = other146.nulls_first; +SortingColumn::SortingColumn(SortingColumn&& other162) noexcept { + column_idx = other162.column_idx; + descending = other162.descending; + nulls_first = other162.nulls_first; } -SortingColumn& SortingColumn::operator=(const SortingColumn& other147) noexcept { - column_idx = other147.column_idx; - descending = other147.descending; - nulls_first = other147.nulls_first; +SortingColumn& SortingColumn::operator=(const SortingColumn& other163) noexcept { + column_idx = other163.column_idx; + descending = other163.descending; + nulls_first = other163.nulls_first; return *this; } -SortingColumn& SortingColumn::operator=(SortingColumn&& other148) noexcept { - column_idx = other148.column_idx; - descending = other148.descending; - nulls_first = other148.nulls_first; +SortingColumn& SortingColumn::operator=(SortingColumn&& other164) noexcept { + column_idx = other164.column_idx; + descending = other164.descending; + nulls_first = other164.nulls_first; return *this; } void SortingColumn::printTo(std::ostream& out) const { @@ -5228,9 +5467,9 @@ uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast149; - xfer += iprot->readI32(ecast149); - this->page_type = static_cast(ecast149); + int32_t ecast165; + xfer += iprot->readI32(ecast165); + this->page_type = static_cast(ecast165); isset_page_type = true; } else { xfer += iprot->skip(ftype); @@ -5238,9 +5477,9 @@ uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 2: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast150; - xfer += iprot->readI32(ecast150); - this->encoding = static_cast(ecast150); + int32_t ecast166; + xfer += iprot->readI32(ecast166); + this->encoding = static_cast(ecast166); isset_encoding = true; } else { xfer += iprot->skip(ftype); @@ -5301,26 +5540,26 @@ void swap(PageEncodingStats &a, PageEncodingStats &b) { swap(a.count, b.count); } -PageEncodingStats::PageEncodingStats(const PageEncodingStats& other151) noexcept { - page_type = other151.page_type; - encoding = other151.encoding; - count = other151.count; +PageEncodingStats::PageEncodingStats(const 
PageEncodingStats& other167) noexcept { + page_type = other167.page_type; + encoding = other167.encoding; + count = other167.count; } -PageEncodingStats::PageEncodingStats(PageEncodingStats&& other152) noexcept { - page_type = other152.page_type; - encoding = other152.encoding; - count = other152.count; +PageEncodingStats::PageEncodingStats(PageEncodingStats&& other168) noexcept { + page_type = other168.page_type; + encoding = other168.encoding; + count = other168.count; } -PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other153) noexcept { - page_type = other153.page_type; - encoding = other153.encoding; - count = other153.count; +PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other169) noexcept { + page_type = other169.page_type; + encoding = other169.encoding; + count = other169.count; return *this; } -PageEncodingStats& PageEncodingStats::operator=(PageEncodingStats&& other154) noexcept { - page_type = other154.page_type; - encoding = other154.encoding; - count = other154.count; +PageEncodingStats& PageEncodingStats::operator=(PageEncodingStats&& other170) noexcept { + page_type = other170.page_type; + encoding = other170.encoding; + count = other170.count; return *this; } void PageEncodingStats::printTo(std::ostream& out) const { @@ -5398,6 +5637,16 @@ void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) { this->bloom_filter_offset = val; __isset.bloom_filter_offset = true; } + +void ColumnMetaData::__set_bloom_filter_length(const int32_t val) { + this->bloom_filter_length = val; +__isset.bloom_filter_length = true; +} + +void ColumnMetaData::__set_size_statistics(const SizeStatistics& val) { + this->size_statistics = val; +__isset.size_statistics = true; +} std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj) { obj.printTo(out); @@ -5436,9 +5685,9 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { { case 1: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast155; - xfer += iprot->readI32(ecast155); - this->type = static_cast(ecast155); + int32_t ecast171; + xfer += iprot->readI32(ecast171); + this->type = static_cast(ecast171); isset_type = true; } else { xfer += iprot->skip(ftype); @@ -5448,16 +5697,16 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->encodings.clear(); - uint32_t _size156; - ::apache::thrift::protocol::TType _etype159; - xfer += iprot->readListBegin(_etype159, _size156); - this->encodings.resize(_size156); - uint32_t _i160; - for (_i160 = 0; _i160 < _size156; ++_i160) + uint32_t _size172; + ::apache::thrift::protocol::TType _etype175; + xfer += iprot->readListBegin(_etype175, _size172); + this->encodings.resize(_size172); + uint32_t _i176; + for (_i176 = 0; _i176 < _size172; ++_i176) { - int32_t ecast161; - xfer += iprot->readI32(ecast161); - this->encodings[_i160] = static_cast(ecast161); + int32_t ecast177; + xfer += iprot->readI32(ecast177); + this->encodings[_i176] = static_cast(ecast177); } xfer += iprot->readListEnd(); } @@ -5470,14 +5719,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->path_in_schema.clear(); - uint32_t _size162; - ::apache::thrift::protocol::TType _etype165; - xfer += iprot->readListBegin(_etype165, _size162); - this->path_in_schema.resize(_size162); - uint32_t _i166; - for (_i166 = 0; _i166 < _size162; ++_i166) + uint32_t _size178; + 
::apache::thrift::protocol::TType _etype181; + xfer += iprot->readListBegin(_etype181, _size178); + this->path_in_schema.resize(_size178); + uint32_t _i182; + for (_i182 = 0; _i182 < _size178; ++_i182) { - xfer += iprot->readString(this->path_in_schema[_i166]); + xfer += iprot->readString(this->path_in_schema[_i182]); } xfer += iprot->readListEnd(); } @@ -5488,9 +5737,9 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast167; - xfer += iprot->readI32(ecast167); - this->codec = static_cast(ecast167); + int32_t ecast183; + xfer += iprot->readI32(ecast183); + this->codec = static_cast(ecast183); isset_codec = true; } else { xfer += iprot->skip(ftype); @@ -5524,14 +5773,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->key_value_metadata.clear(); - uint32_t _size168; - ::apache::thrift::protocol::TType _etype171; - xfer += iprot->readListBegin(_etype171, _size168); - this->key_value_metadata.resize(_size168); - uint32_t _i172; - for (_i172 = 0; _i172 < _size168; ++_i172) + uint32_t _size184; + ::apache::thrift::protocol::TType _etype187; + xfer += iprot->readListBegin(_etype187, _size184); + this->key_value_metadata.resize(_size184); + uint32_t _i188; + for (_i188 = 0; _i188 < _size184; ++_i188) { - xfer += this->key_value_metadata[_i172].read(iprot); + xfer += this->key_value_metadata[_i188].read(iprot); } xfer += iprot->readListEnd(); } @@ -5576,14 +5825,14 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->encoding_stats.clear(); - uint32_t _size173; - ::apache::thrift::protocol::TType _etype176; - xfer += iprot->readListBegin(_etype176, _size173); - this->encoding_stats.resize(_size173); - uint32_t _i177; - for (_i177 = 0; _i177 < _size173; ++_i177) + uint32_t _size189; + ::apache::thrift::protocol::TType _etype192; + xfer += iprot->readListBegin(_etype192, _size189); + this->encoding_stats.resize(_size189); + uint32_t _i193; + for (_i193 = 0; _i193 < _size189; ++_i193) { - xfer += this->encoding_stats[_i177].read(iprot); + xfer += this->encoding_stats[_i193].read(iprot); } xfer += iprot->readListEnd(); } @@ -5600,6 +5849,22 @@ uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 15: + if (ftype == ::apache::thrift::protocol::T_I32) { + xfer += iprot->readI32(this->bloom_filter_length); + this->__isset.bloom_filter_length = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 16: + if (ftype == ::apache::thrift::protocol::T_STRUCT) { + xfer += this->size_statistics.read(iprot); + this->__isset.size_statistics = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -5640,10 +5905,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast(this->encodings.size())); - std::vector ::const_iterator _iter178; - for (_iter178 = this->encodings.begin(); _iter178 != this->encodings.end(); ++_iter178) + std::vector ::const_iterator _iter194; + for (_iter194 = this->encodings.begin(); _iter194 != this->encodings.end(); ++_iter194) { - xfer += oprot->writeI32(static_cast((*_iter178))); + xfer += 
oprot->writeI32(static_cast((*_iter194))); } xfer += oprot->writeListEnd(); } @@ -5652,10 +5917,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->path_in_schema.size())); - std::vector ::const_iterator _iter179; - for (_iter179 = this->path_in_schema.begin(); _iter179 != this->path_in_schema.end(); ++_iter179) + std::vector ::const_iterator _iter195; + for (_iter195 = this->path_in_schema.begin(); _iter195 != this->path_in_schema.end(); ++_iter195) { - xfer += oprot->writeString((*_iter179)); + xfer += oprot->writeString((*_iter195)); } xfer += oprot->writeListEnd(); } @@ -5681,10 +5946,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->key_value_metadata.size())); - std::vector ::const_iterator _iter180; - for (_iter180 = this->key_value_metadata.begin(); _iter180 != this->key_value_metadata.end(); ++_iter180) + std::vector ::const_iterator _iter196; + for (_iter196 = this->key_value_metadata.begin(); _iter196 != this->key_value_metadata.end(); ++_iter196) { - xfer += (*_iter180).write(oprot); + xfer += (*_iter196).write(oprot); } xfer += oprot->writeListEnd(); } @@ -5713,10 +5978,10 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->encoding_stats.size())); - std::vector ::const_iterator _iter181; - for (_iter181 = this->encoding_stats.begin(); _iter181 != this->encoding_stats.end(); ++_iter181) + std::vector ::const_iterator _iter197; + for (_iter197 = this->encoding_stats.begin(); _iter197 != this->encoding_stats.end(); ++_iter197) { - xfer += (*_iter181).write(oprot); + xfer += (*_iter197).write(oprot); } xfer += oprot->writeListEnd(); } @@ -5727,6 +5992,16 @@ uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) con xfer += oprot->writeI64(this->bloom_filter_offset); xfer += oprot->writeFieldEnd(); } + if (this->__isset.bloom_filter_length) { + xfer += oprot->writeFieldBegin("bloom_filter_length", ::apache::thrift::protocol::T_I32, 15); + xfer += oprot->writeI32(this->bloom_filter_length); + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.size_statistics) { + xfer += oprot->writeFieldBegin("size_statistics", ::apache::thrift::protocol::T_STRUCT, 16); + xfer += this->size_statistics.write(oprot); + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -5748,77 +6023,87 @@ void swap(ColumnMetaData &a, ColumnMetaData &b) { swap(a.statistics, b.statistics); swap(a.encoding_stats, b.encoding_stats); swap(a.bloom_filter_offset, b.bloom_filter_offset); + swap(a.bloom_filter_length, b.bloom_filter_length); + swap(a.size_statistics, b.size_statistics); swap(a.__isset, b.__isset); } -ColumnMetaData::ColumnMetaData(const ColumnMetaData& other182) { - type = other182.type; - encodings = other182.encodings; - path_in_schema = other182.path_in_schema; - codec = other182.codec; - num_values = other182.num_values; - total_uncompressed_size = other182.total_uncompressed_size; - 
total_compressed_size = other182.total_compressed_size; - key_value_metadata = other182.key_value_metadata; - data_page_offset = other182.data_page_offset; - index_page_offset = other182.index_page_offset; - dictionary_page_offset = other182.dictionary_page_offset; - statistics = other182.statistics; - encoding_stats = other182.encoding_stats; - bloom_filter_offset = other182.bloom_filter_offset; - __isset = other182.__isset; -} -ColumnMetaData::ColumnMetaData(ColumnMetaData&& other183) noexcept { - type = other183.type; - encodings = std::move(other183.encodings); - path_in_schema = std::move(other183.path_in_schema); - codec = other183.codec; - num_values = other183.num_values; - total_uncompressed_size = other183.total_uncompressed_size; - total_compressed_size = other183.total_compressed_size; - key_value_metadata = std::move(other183.key_value_metadata); - data_page_offset = other183.data_page_offset; - index_page_offset = other183.index_page_offset; - dictionary_page_offset = other183.dictionary_page_offset; - statistics = std::move(other183.statistics); - encoding_stats = std::move(other183.encoding_stats); - bloom_filter_offset = other183.bloom_filter_offset; - __isset = other183.__isset; -} -ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other184) { - type = other184.type; - encodings = other184.encodings; - path_in_schema = other184.path_in_schema; - codec = other184.codec; - num_values = other184.num_values; - total_uncompressed_size = other184.total_uncompressed_size; - total_compressed_size = other184.total_compressed_size; - key_value_metadata = other184.key_value_metadata; - data_page_offset = other184.data_page_offset; - index_page_offset = other184.index_page_offset; - dictionary_page_offset = other184.dictionary_page_offset; - statistics = other184.statistics; - encoding_stats = other184.encoding_stats; - bloom_filter_offset = other184.bloom_filter_offset; - __isset = other184.__isset; +ColumnMetaData::ColumnMetaData(const ColumnMetaData& other198) { + type = other198.type; + encodings = other198.encodings; + path_in_schema = other198.path_in_schema; + codec = other198.codec; + num_values = other198.num_values; + total_uncompressed_size = other198.total_uncompressed_size; + total_compressed_size = other198.total_compressed_size; + key_value_metadata = other198.key_value_metadata; + data_page_offset = other198.data_page_offset; + index_page_offset = other198.index_page_offset; + dictionary_page_offset = other198.dictionary_page_offset; + statistics = other198.statistics; + encoding_stats = other198.encoding_stats; + bloom_filter_offset = other198.bloom_filter_offset; + bloom_filter_length = other198.bloom_filter_length; + size_statistics = other198.size_statistics; + __isset = other198.__isset; +} +ColumnMetaData::ColumnMetaData(ColumnMetaData&& other199) noexcept { + type = other199.type; + encodings = std::move(other199.encodings); + path_in_schema = std::move(other199.path_in_schema); + codec = other199.codec; + num_values = other199.num_values; + total_uncompressed_size = other199.total_uncompressed_size; + total_compressed_size = other199.total_compressed_size; + key_value_metadata = std::move(other199.key_value_metadata); + data_page_offset = other199.data_page_offset; + index_page_offset = other199.index_page_offset; + dictionary_page_offset = other199.dictionary_page_offset; + statistics = std::move(other199.statistics); + encoding_stats = std::move(other199.encoding_stats); + bloom_filter_offset = other199.bloom_filter_offset; + 
bloom_filter_length = other199.bloom_filter_length; + size_statistics = std::move(other199.size_statistics); + __isset = other199.__isset; +} +ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other200) { + type = other200.type; + encodings = other200.encodings; + path_in_schema = other200.path_in_schema; + codec = other200.codec; + num_values = other200.num_values; + total_uncompressed_size = other200.total_uncompressed_size; + total_compressed_size = other200.total_compressed_size; + key_value_metadata = other200.key_value_metadata; + data_page_offset = other200.data_page_offset; + index_page_offset = other200.index_page_offset; + dictionary_page_offset = other200.dictionary_page_offset; + statistics = other200.statistics; + encoding_stats = other200.encoding_stats; + bloom_filter_offset = other200.bloom_filter_offset; + bloom_filter_length = other200.bloom_filter_length; + size_statistics = other200.size_statistics; + __isset = other200.__isset; return *this; } -ColumnMetaData& ColumnMetaData::operator=(ColumnMetaData&& other185) noexcept { - type = other185.type; - encodings = std::move(other185.encodings); - path_in_schema = std::move(other185.path_in_schema); - codec = other185.codec; - num_values = other185.num_values; - total_uncompressed_size = other185.total_uncompressed_size; - total_compressed_size = other185.total_compressed_size; - key_value_metadata = std::move(other185.key_value_metadata); - data_page_offset = other185.data_page_offset; - index_page_offset = other185.index_page_offset; - dictionary_page_offset = other185.dictionary_page_offset; - statistics = std::move(other185.statistics); - encoding_stats = std::move(other185.encoding_stats); - bloom_filter_offset = other185.bloom_filter_offset; - __isset = other185.__isset; +ColumnMetaData& ColumnMetaData::operator=(ColumnMetaData&& other201) noexcept { + type = other201.type; + encodings = std::move(other201.encodings); + path_in_schema = std::move(other201.path_in_schema); + codec = other201.codec; + num_values = other201.num_values; + total_uncompressed_size = other201.total_uncompressed_size; + total_compressed_size = other201.total_compressed_size; + key_value_metadata = std::move(other201.key_value_metadata); + data_page_offset = other201.data_page_offset; + index_page_offset = other201.index_page_offset; + dictionary_page_offset = other201.dictionary_page_offset; + statistics = std::move(other201.statistics); + encoding_stats = std::move(other201.encoding_stats); + bloom_filter_offset = other201.bloom_filter_offset; + bloom_filter_length = other201.bloom_filter_length; + size_statistics = std::move(other201.size_statistics); + __isset = other201.__isset; return *this; } void ColumnMetaData::printTo(std::ostream& out) const { @@ -5838,6 +6123,8 @@ void ColumnMetaData::printTo(std::ostream& out) const { out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "")); out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "")); out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "")); + out << ", " << "bloom_filter_length="; (__isset.bloom_filter_length ? (out << to_string(bloom_filter_length)) : (out << "")); + out << ", " << "size_statistics="; (__isset.size_statistics ? 
(out << to_string(size_statistics)) : (out << "")); out << ")"; } @@ -5896,18 +6183,18 @@ void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) { (void) b; } -EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other186) noexcept { - (void) other186; +EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other202) noexcept { + (void) other202; } -EncryptionWithFooterKey::EncryptionWithFooterKey(EncryptionWithFooterKey&& other187) noexcept { - (void) other187; +EncryptionWithFooterKey::EncryptionWithFooterKey(EncryptionWithFooterKey&& other203) noexcept { + (void) other203; } -EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other188) noexcept { - (void) other188; +EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other204) noexcept { + (void) other204; return *this; } -EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(EncryptionWithFooterKey&& other189) noexcept { - (void) other189; +EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(EncryptionWithFooterKey&& other205) noexcept { + (void) other205; return *this; } void EncryptionWithFooterKey::printTo(std::ostream& out) const { @@ -5962,14 +6249,14 @@ uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* ip if (ftype == ::apache::thrift::protocol::T_LIST) { { this->path_in_schema.clear(); - uint32_t _size190; - ::apache::thrift::protocol::TType _etype193; - xfer += iprot->readListBegin(_etype193, _size190); - this->path_in_schema.resize(_size190); - uint32_t _i194; - for (_i194 = 0; _i194 < _size190; ++_i194) + uint32_t _size206; + ::apache::thrift::protocol::TType _etype209; + xfer += iprot->readListBegin(_etype209, _size206); + this->path_in_schema.resize(_size206); + uint32_t _i210; + for (_i210 = 0; _i210 < _size206; ++_i210) { - xfer += iprot->readString(this->path_in_schema[_i194]); + xfer += iprot->readString(this->path_in_schema[_i210]); } xfer += iprot->readListEnd(); } @@ -6008,10 +6295,10 @@ uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* o xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->path_in_schema.size())); - std::vector ::const_iterator _iter195; - for (_iter195 = this->path_in_schema.begin(); _iter195 != this->path_in_schema.end(); ++_iter195) + std::vector ::const_iterator _iter211; + for (_iter211 = this->path_in_schema.begin(); _iter211 != this->path_in_schema.end(); ++_iter211) { - xfer += oprot->writeString((*_iter195)); + xfer += oprot->writeString((*_iter211)); } xfer += oprot->writeListEnd(); } @@ -6034,26 +6321,26 @@ void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) { swap(a.__isset, b.__isset); } -EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other196) { - path_in_schema = other196.path_in_schema; - key_metadata = other196.key_metadata; - __isset = other196.__isset; +EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other212) { + path_in_schema = other212.path_in_schema; + key_metadata = other212.key_metadata; + __isset = other212.__isset; } -EncryptionWithColumnKey::EncryptionWithColumnKey(EncryptionWithColumnKey&& other197) noexcept { - path_in_schema = std::move(other197.path_in_schema); - key_metadata = std::move(other197.key_metadata); - __isset = other197.__isset; 
+EncryptionWithColumnKey::EncryptionWithColumnKey(EncryptionWithColumnKey&& other213) noexcept { + path_in_schema = std::move(other213.path_in_schema); + key_metadata = std::move(other213.key_metadata); + __isset = other213.__isset; } -EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other198) { - path_in_schema = other198.path_in_schema; - key_metadata = other198.key_metadata; - __isset = other198.__isset; +EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other214) { + path_in_schema = other214.path_in_schema; + key_metadata = other214.key_metadata; + __isset = other214.__isset; return *this; } -EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(EncryptionWithColumnKey&& other199) noexcept { - path_in_schema = std::move(other199.path_in_schema); - key_metadata = std::move(other199.key_metadata); - __isset = other199.__isset; +EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(EncryptionWithColumnKey&& other215) noexcept { + path_in_schema = std::move(other215.path_in_schema); + key_metadata = std::move(other215.key_metadata); + __isset = other215.__isset; return *this; } void EncryptionWithColumnKey::printTo(std::ostream& out) const { @@ -6161,26 +6448,26 @@ void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) { swap(a.__isset, b.__isset); } -ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other200) { - ENCRYPTION_WITH_FOOTER_KEY = other200.ENCRYPTION_WITH_FOOTER_KEY; - ENCRYPTION_WITH_COLUMN_KEY = other200.ENCRYPTION_WITH_COLUMN_KEY; - __isset = other200.__isset; +ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other216) { + ENCRYPTION_WITH_FOOTER_KEY = other216.ENCRYPTION_WITH_FOOTER_KEY; + ENCRYPTION_WITH_COLUMN_KEY = other216.ENCRYPTION_WITH_COLUMN_KEY; + __isset = other216.__isset; } -ColumnCryptoMetaData::ColumnCryptoMetaData(ColumnCryptoMetaData&& other201) noexcept { - ENCRYPTION_WITH_FOOTER_KEY = std::move(other201.ENCRYPTION_WITH_FOOTER_KEY); - ENCRYPTION_WITH_COLUMN_KEY = std::move(other201.ENCRYPTION_WITH_COLUMN_KEY); - __isset = other201.__isset; +ColumnCryptoMetaData::ColumnCryptoMetaData(ColumnCryptoMetaData&& other217) noexcept { + ENCRYPTION_WITH_FOOTER_KEY = std::move(other217.ENCRYPTION_WITH_FOOTER_KEY); + ENCRYPTION_WITH_COLUMN_KEY = std::move(other217.ENCRYPTION_WITH_COLUMN_KEY); + __isset = other217.__isset; } -ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other202) { - ENCRYPTION_WITH_FOOTER_KEY = other202.ENCRYPTION_WITH_FOOTER_KEY; - ENCRYPTION_WITH_COLUMN_KEY = other202.ENCRYPTION_WITH_COLUMN_KEY; - __isset = other202.__isset; +ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other218) { + ENCRYPTION_WITH_FOOTER_KEY = other218.ENCRYPTION_WITH_FOOTER_KEY; + ENCRYPTION_WITH_COLUMN_KEY = other218.ENCRYPTION_WITH_COLUMN_KEY; + __isset = other218.__isset; return *this; } -ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(ColumnCryptoMetaData&& other203) noexcept { - ENCRYPTION_WITH_FOOTER_KEY = std::move(other203.ENCRYPTION_WITH_FOOTER_KEY); - ENCRYPTION_WITH_COLUMN_KEY = std::move(other203.ENCRYPTION_WITH_COLUMN_KEY); - __isset = other203.__isset; +ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(ColumnCryptoMetaData&& other219) noexcept { + ENCRYPTION_WITH_FOOTER_KEY = std::move(other219.ENCRYPTION_WITH_FOOTER_KEY); + ENCRYPTION_WITH_COLUMN_KEY = std::move(other219.ENCRYPTION_WITH_COLUMN_KEY); + __isset = other219.__isset; return *this; 
} void ColumnCryptoMetaData::printTo(std::ostream& out) const { @@ -6422,54 +6709,54 @@ void swap(ColumnChunk &a, ColumnChunk &b) { swap(a.__isset, b.__isset); } -ColumnChunk::ColumnChunk(const ColumnChunk& other204) { - file_path = other204.file_path; - file_offset = other204.file_offset; - meta_data = other204.meta_data; - offset_index_offset = other204.offset_index_offset; - offset_index_length = other204.offset_index_length; - column_index_offset = other204.column_index_offset; - column_index_length = other204.column_index_length; - crypto_metadata = other204.crypto_metadata; - encrypted_column_metadata = other204.encrypted_column_metadata; - __isset = other204.__isset; -} -ColumnChunk::ColumnChunk(ColumnChunk&& other205) noexcept { - file_path = std::move(other205.file_path); - file_offset = other205.file_offset; - meta_data = std::move(other205.meta_data); - offset_index_offset = other205.offset_index_offset; - offset_index_length = other205.offset_index_length; - column_index_offset = other205.column_index_offset; - column_index_length = other205.column_index_length; - crypto_metadata = std::move(other205.crypto_metadata); - encrypted_column_metadata = std::move(other205.encrypted_column_metadata); - __isset = other205.__isset; -} -ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other206) { - file_path = other206.file_path; - file_offset = other206.file_offset; - meta_data = other206.meta_data; - offset_index_offset = other206.offset_index_offset; - offset_index_length = other206.offset_index_length; - column_index_offset = other206.column_index_offset; - column_index_length = other206.column_index_length; - crypto_metadata = other206.crypto_metadata; - encrypted_column_metadata = other206.encrypted_column_metadata; - __isset = other206.__isset; +ColumnChunk::ColumnChunk(const ColumnChunk& other220) { + file_path = other220.file_path; + file_offset = other220.file_offset; + meta_data = other220.meta_data; + offset_index_offset = other220.offset_index_offset; + offset_index_length = other220.offset_index_length; + column_index_offset = other220.column_index_offset; + column_index_length = other220.column_index_length; + crypto_metadata = other220.crypto_metadata; + encrypted_column_metadata = other220.encrypted_column_metadata; + __isset = other220.__isset; +} +ColumnChunk::ColumnChunk(ColumnChunk&& other221) noexcept { + file_path = std::move(other221.file_path); + file_offset = other221.file_offset; + meta_data = std::move(other221.meta_data); + offset_index_offset = other221.offset_index_offset; + offset_index_length = other221.offset_index_length; + column_index_offset = other221.column_index_offset; + column_index_length = other221.column_index_length; + crypto_metadata = std::move(other221.crypto_metadata); + encrypted_column_metadata = std::move(other221.encrypted_column_metadata); + __isset = other221.__isset; +} +ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other222) { + file_path = other222.file_path; + file_offset = other222.file_offset; + meta_data = other222.meta_data; + offset_index_offset = other222.offset_index_offset; + offset_index_length = other222.offset_index_length; + column_index_offset = other222.column_index_offset; + column_index_length = other222.column_index_length; + crypto_metadata = other222.crypto_metadata; + encrypted_column_metadata = other222.encrypted_column_metadata; + __isset = other222.__isset; return *this; } -ColumnChunk& ColumnChunk::operator=(ColumnChunk&& other207) noexcept { - file_path = std::move(other207.file_path); - 
file_offset = other207.file_offset; - meta_data = std::move(other207.meta_data); - offset_index_offset = other207.offset_index_offset; - offset_index_length = other207.offset_index_length; - column_index_offset = other207.column_index_offset; - column_index_length = other207.column_index_length; - crypto_metadata = std::move(other207.crypto_metadata); - encrypted_column_metadata = std::move(other207.encrypted_column_metadata); - __isset = other207.__isset; +ColumnChunk& ColumnChunk::operator=(ColumnChunk&& other223) noexcept { + file_path = std::move(other223.file_path); + file_offset = other223.file_offset; + meta_data = std::move(other223.meta_data); + offset_index_offset = other223.offset_index_offset; + offset_index_length = other223.offset_index_length; + column_index_offset = other223.column_index_offset; + column_index_length = other223.column_index_length; + crypto_metadata = std::move(other223.crypto_metadata); + encrypted_column_metadata = std::move(other223.encrypted_column_metadata); + __isset = other223.__isset; return *this; } void ColumnChunk::printTo(std::ostream& out) const { @@ -6558,14 +6845,14 @@ uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->columns.clear(); - uint32_t _size208; - ::apache::thrift::protocol::TType _etype211; - xfer += iprot->readListBegin(_etype211, _size208); - this->columns.resize(_size208); - uint32_t _i212; - for (_i212 = 0; _i212 < _size208; ++_i212) + uint32_t _size224; + ::apache::thrift::protocol::TType _etype227; + xfer += iprot->readListBegin(_etype227, _size224); + this->columns.resize(_size224); + uint32_t _i228; + for (_i228 = 0; _i228 < _size224; ++_i228) { - xfer += this->columns[_i212].read(iprot); + xfer += this->columns[_i228].read(iprot); } xfer += iprot->readListEnd(); } @@ -6594,14 +6881,14 @@ uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->sorting_columns.clear(); - uint32_t _size213; - ::apache::thrift::protocol::TType _etype216; - xfer += iprot->readListBegin(_etype216, _size213); - this->sorting_columns.resize(_size213); - uint32_t _i217; - for (_i217 = 0; _i217 < _size213; ++_i217) + uint32_t _size229; + ::apache::thrift::protocol::TType _etype232; + xfer += iprot->readListBegin(_etype232, _size229); + this->sorting_columns.resize(_size229); + uint32_t _i233; + for (_i233 = 0; _i233 < _size229; ++_i233) { - xfer += this->sorting_columns[_i217].read(iprot); + xfer += this->sorting_columns[_i233].read(iprot); } xfer += iprot->readListEnd(); } @@ -6660,10 +6947,10 @@ uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->columns.size())); - std::vector ::const_iterator _iter218; - for (_iter218 = this->columns.begin(); _iter218 != this->columns.end(); ++_iter218) + std::vector ::const_iterator _iter234; + for (_iter234 = this->columns.begin(); _iter234 != this->columns.end(); ++_iter234) { - xfer += (*_iter218).write(oprot); + xfer += (*_iter234).write(oprot); } xfer += oprot->writeListEnd(); } @@ -6681,10 +6968,10 @@ uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const { xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, 
static_cast(this->sorting_columns.size())); - std::vector ::const_iterator _iter219; - for (_iter219 = this->sorting_columns.begin(); _iter219 != this->sorting_columns.end(); ++_iter219) + std::vector ::const_iterator _iter235; + for (_iter235 = this->sorting_columns.begin(); _iter235 != this->sorting_columns.end(); ++_iter235) { - xfer += (*_iter219).write(oprot); + xfer += (*_iter235).write(oprot); } xfer += oprot->writeListEnd(); } @@ -6722,46 +7009,46 @@ void swap(RowGroup &a, RowGroup &b) { swap(a.__isset, b.__isset); } -RowGroup::RowGroup(const RowGroup& other220) { - columns = other220.columns; - total_byte_size = other220.total_byte_size; - num_rows = other220.num_rows; - sorting_columns = other220.sorting_columns; - file_offset = other220.file_offset; - total_compressed_size = other220.total_compressed_size; - ordinal = other220.ordinal; - __isset = other220.__isset; -} -RowGroup::RowGroup(RowGroup&& other221) noexcept { - columns = std::move(other221.columns); - total_byte_size = other221.total_byte_size; - num_rows = other221.num_rows; - sorting_columns = std::move(other221.sorting_columns); - file_offset = other221.file_offset; - total_compressed_size = other221.total_compressed_size; - ordinal = other221.ordinal; - __isset = other221.__isset; -} -RowGroup& RowGroup::operator=(const RowGroup& other222) { - columns = other222.columns; - total_byte_size = other222.total_byte_size; - num_rows = other222.num_rows; - sorting_columns = other222.sorting_columns; - file_offset = other222.file_offset; - total_compressed_size = other222.total_compressed_size; - ordinal = other222.ordinal; - __isset = other222.__isset; +RowGroup::RowGroup(const RowGroup& other236) { + columns = other236.columns; + total_byte_size = other236.total_byte_size; + num_rows = other236.num_rows; + sorting_columns = other236.sorting_columns; + file_offset = other236.file_offset; + total_compressed_size = other236.total_compressed_size; + ordinal = other236.ordinal; + __isset = other236.__isset; +} +RowGroup::RowGroup(RowGroup&& other237) noexcept { + columns = std::move(other237.columns); + total_byte_size = other237.total_byte_size; + num_rows = other237.num_rows; + sorting_columns = std::move(other237.sorting_columns); + file_offset = other237.file_offset; + total_compressed_size = other237.total_compressed_size; + ordinal = other237.ordinal; + __isset = other237.__isset; +} +RowGroup& RowGroup::operator=(const RowGroup& other238) { + columns = other238.columns; + total_byte_size = other238.total_byte_size; + num_rows = other238.num_rows; + sorting_columns = other238.sorting_columns; + file_offset = other238.file_offset; + total_compressed_size = other238.total_compressed_size; + ordinal = other238.ordinal; + __isset = other238.__isset; return *this; } -RowGroup& RowGroup::operator=(RowGroup&& other223) noexcept { - columns = std::move(other223.columns); - total_byte_size = other223.total_byte_size; - num_rows = other223.num_rows; - sorting_columns = std::move(other223.sorting_columns); - file_offset = other223.file_offset; - total_compressed_size = other223.total_compressed_size; - ordinal = other223.ordinal; - __isset = other223.__isset; +RowGroup& RowGroup::operator=(RowGroup&& other239) noexcept { + columns = std::move(other239.columns); + total_byte_size = other239.total_byte_size; + num_rows = other239.num_rows; + sorting_columns = std::move(other239.sorting_columns); + file_offset = other239.file_offset; + total_compressed_size = other239.total_compressed_size; + ordinal = other239.ordinal; + __isset = 
other239.__isset;
   return *this;
 }
 void RowGroup::printTo(std::ostream& out) const {
@@ -6832,18 +7119,18 @@ void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) {
   (void) b;
 }
 
-TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other224) noexcept {
-  (void) other224;
+TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other240) noexcept {
+  (void) other240;
 }
-TypeDefinedOrder::TypeDefinedOrder(TypeDefinedOrder&& other225) noexcept {
-  (void) other225;
+TypeDefinedOrder::TypeDefinedOrder(TypeDefinedOrder&& other241) noexcept {
+  (void) other241;
 }
-TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other226) noexcept {
-  (void) other226;
+TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other242) noexcept {
+  (void) other242;
   return *this;
 }
-TypeDefinedOrder& TypeDefinedOrder::operator=(TypeDefinedOrder&& other227) noexcept {
-  (void) other227;
+TypeDefinedOrder& TypeDefinedOrder::operator=(TypeDefinedOrder&& other243) noexcept {
+  (void) other243;
   return *this;
 }
 void TypeDefinedOrder::printTo(std::ostream& out) const {
@@ -6930,22 +7217,22 @@ void swap(ColumnOrder &a, ColumnOrder &b) {
   swap(a.__isset, b.__isset);
 }
 
-ColumnOrder::ColumnOrder(const ColumnOrder& other228) noexcept {
-  TYPE_ORDER = other228.TYPE_ORDER;
-  __isset = other228.__isset;
+ColumnOrder::ColumnOrder(const ColumnOrder& other244) noexcept {
+  TYPE_ORDER = other244.TYPE_ORDER;
+  __isset = other244.__isset;
 }
-ColumnOrder::ColumnOrder(ColumnOrder&& other229) noexcept {
-  TYPE_ORDER = std::move(other229.TYPE_ORDER);
-  __isset = other229.__isset;
+ColumnOrder::ColumnOrder(ColumnOrder&& other245) noexcept {
+  TYPE_ORDER = std::move(other245.TYPE_ORDER);
+  __isset = other245.__isset;
 }
-ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other230) noexcept {
-  TYPE_ORDER = other230.TYPE_ORDER;
-  __isset = other230.__isset;
+ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other246) noexcept {
+  TYPE_ORDER = other246.TYPE_ORDER;
+  __isset = other246.__isset;
   return *this;
 }
-ColumnOrder& ColumnOrder::operator=(ColumnOrder&& other231) noexcept {
-  TYPE_ORDER = std::move(other231.TYPE_ORDER);
-  __isset = other231.__isset;
+ColumnOrder& ColumnOrder::operator=(ColumnOrder&& other247) noexcept {
+  TYPE_ORDER = std::move(other247.TYPE_ORDER);
+  __isset = other247.__isset;
   return *this;
 }
 void ColumnOrder::printTo(std::ostream& out) const {
@@ -7073,26 +7360,26 @@ void swap(PageLocation &a, PageLocation &b) {
   swap(a.first_row_index, b.first_row_index);
 }
 
-PageLocation::PageLocation(const PageLocation& other232) noexcept {
-  offset = other232.offset;
-  compressed_page_size = other232.compressed_page_size;
-  first_row_index = other232.first_row_index;
+PageLocation::PageLocation(const PageLocation& other248) noexcept {
+  offset = other248.offset;
+  compressed_page_size = other248.compressed_page_size;
+  first_row_index = other248.first_row_index;
 }
-PageLocation::PageLocation(PageLocation&& other233) noexcept {
-  offset = other233.offset;
-  compressed_page_size = other233.compressed_page_size;
-  first_row_index = other233.first_row_index;
+PageLocation::PageLocation(PageLocation&& other249) noexcept {
+  offset = other249.offset;
+  compressed_page_size = other249.compressed_page_size;
+  first_row_index = other249.first_row_index;
 }
-PageLocation& PageLocation::operator=(const PageLocation& other234) noexcept {
-  offset = other234.offset;
-  compressed_page_size = other234.compressed_page_size;
-  first_row_index = other234.first_row_index;
+PageLocation& 
PageLocation::operator=(const PageLocation& other250) noexcept { + offset = other250.offset; + compressed_page_size = other250.compressed_page_size; + first_row_index = other250.first_row_index; return *this; } -PageLocation& PageLocation::operator=(PageLocation&& other235) noexcept { - offset = other235.offset; - compressed_page_size = other235.compressed_page_size; - first_row_index = other235.first_row_index; +PageLocation& PageLocation::operator=(PageLocation&& other251) noexcept { + offset = other251.offset; + compressed_page_size = other251.compressed_page_size; + first_row_index = other251.first_row_index; return *this; } void PageLocation::printTo(std::ostream& out) const { @@ -7112,6 +7399,11 @@ OffsetIndex::~OffsetIndex() noexcept { void OffsetIndex::__set_page_locations(const std::vector & val) { this->page_locations = val; } + +void OffsetIndex::__set_unencoded_byte_array_data_bytes(const std::vector & val) { + this->unencoded_byte_array_data_bytes = val; +__isset.unencoded_byte_array_data_bytes = true; +} std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj) { obj.printTo(out); @@ -7145,14 +7437,14 @@ uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->page_locations.clear(); - uint32_t _size236; - ::apache::thrift::protocol::TType _etype239; - xfer += iprot->readListBegin(_etype239, _size236); - this->page_locations.resize(_size236); - uint32_t _i240; - for (_i240 = 0; _i240 < _size236; ++_i240) + uint32_t _size252; + ::apache::thrift::protocol::TType _etype255; + xfer += iprot->readListBegin(_etype255, _size252); + this->page_locations.resize(_size252); + uint32_t _i256; + for (_i256 = 0; _i256 < _size252; ++_i256) { - xfer += this->page_locations[_i240].read(iprot); + xfer += this->page_locations[_i256].read(iprot); } xfer += iprot->readListEnd(); } @@ -7161,6 +7453,26 @@ uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 2: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->unencoded_byte_array_data_bytes.clear(); + uint32_t _size257; + ::apache::thrift::protocol::TType _etype260; + xfer += iprot->readListBegin(_etype260, _size257); + this->unencoded_byte_array_data_bytes.resize(_size257); + uint32_t _i261; + for (_i261 = 0; _i261 < _size257; ++_i261) + { + xfer += iprot->readI64(this->unencoded_byte_array_data_bytes[_i261]); + } + xfer += iprot->readListEnd(); + } + this->__isset.unencoded_byte_array_data_bytes = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -7183,15 +7495,28 @@ uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->page_locations.size())); - std::vector ::const_iterator _iter241; - for (_iter241 = this->page_locations.begin(); _iter241 != this->page_locations.end(); ++_iter241) + std::vector ::const_iterator _iter262; + for (_iter262 = this->page_locations.begin(); _iter262 != this->page_locations.end(); ++_iter262) { - xfer += (*_iter241).write(oprot); + xfer += (*_iter262).write(oprot); } xfer += oprot->writeListEnd(); } xfer += oprot->writeFieldEnd(); + if (this->__isset.unencoded_byte_array_data_bytes) { + xfer += oprot->writeFieldBegin("unencoded_byte_array_data_bytes", ::apache::thrift::protocol::T_LIST, 2); + { + 
xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->unencoded_byte_array_data_bytes.size())); + std::vector ::const_iterator _iter263; + for (_iter263 = this->unencoded_byte_array_data_bytes.begin(); _iter263 != this->unencoded_byte_array_data_bytes.end(); ++_iter263) + { + xfer += oprot->writeI64((*_iter263)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } xfer += oprot->writeFieldStop(); xfer += oprot->writeStructEnd(); return xfer; @@ -7200,26 +7525,37 @@ uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const void swap(OffsetIndex &a, OffsetIndex &b) { using ::std::swap; swap(a.page_locations, b.page_locations); + swap(a.unencoded_byte_array_data_bytes, b.unencoded_byte_array_data_bytes); + swap(a.__isset, b.__isset); } -OffsetIndex::OffsetIndex(const OffsetIndex& other242) { - page_locations = other242.page_locations; +OffsetIndex::OffsetIndex(const OffsetIndex& other264) { + page_locations = other264.page_locations; + unencoded_byte_array_data_bytes = other264.unencoded_byte_array_data_bytes; + __isset = other264.__isset; } -OffsetIndex::OffsetIndex(OffsetIndex&& other243) noexcept { - page_locations = std::move(other243.page_locations); +OffsetIndex::OffsetIndex(OffsetIndex&& other265) noexcept { + page_locations = std::move(other265.page_locations); + unencoded_byte_array_data_bytes = std::move(other265.unencoded_byte_array_data_bytes); + __isset = other265.__isset; } -OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other244) { - page_locations = other244.page_locations; +OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other266) { + page_locations = other266.page_locations; + unencoded_byte_array_data_bytes = other266.unencoded_byte_array_data_bytes; + __isset = other266.__isset; return *this; } -OffsetIndex& OffsetIndex::operator=(OffsetIndex&& other245) noexcept { - page_locations = std::move(other245.page_locations); +OffsetIndex& OffsetIndex::operator=(OffsetIndex&& other267) noexcept { + page_locations = std::move(other267.page_locations); + unencoded_byte_array_data_bytes = std::move(other267.unencoded_byte_array_data_bytes); + __isset = other267.__isset; return *this; } void OffsetIndex::printTo(std::ostream& out) const { using ::apache::thrift::to_string; out << "OffsetIndex("; out << "page_locations=" << to_string(page_locations); + out << ", " << "unencoded_byte_array_data_bytes="; (__isset.unencoded_byte_array_data_bytes ? 
(out << to_string(unencoded_byte_array_data_bytes)) : (out << "")); out << ")"; } @@ -7248,6 +7584,16 @@ void ColumnIndex::__set_null_counts(const std::vector & val) { this->null_counts = val; __isset.null_counts = true; } + +void ColumnIndex::__set_repetition_level_histograms(const std::vector & val) { + this->repetition_level_histograms = val; +__isset.repetition_level_histograms = true; +} + +void ColumnIndex::__set_definition_level_histograms(const std::vector & val) { + this->definition_level_histograms = val; +__isset.definition_level_histograms = true; +} std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj) { obj.printTo(out); @@ -7284,14 +7630,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->null_pages.clear(); - uint32_t _size246; - ::apache::thrift::protocol::TType _etype249; - xfer += iprot->readListBegin(_etype249, _size246); - this->null_pages.resize(_size246); - uint32_t _i250; - for (_i250 = 0; _i250 < _size246; ++_i250) + uint32_t _size268; + ::apache::thrift::protocol::TType _etype271; + xfer += iprot->readListBegin(_etype271, _size268); + this->null_pages.resize(_size268); + uint32_t _i272; + for (_i272 = 0; _i272 < _size268; ++_i272) { - xfer += iprot->readBool(this->null_pages[_i250]); + xfer += iprot->readBool(this->null_pages[_i272]); } xfer += iprot->readListEnd(); } @@ -7304,14 +7650,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->min_values.clear(); - uint32_t _size251; - ::apache::thrift::protocol::TType _etype254; - xfer += iprot->readListBegin(_etype254, _size251); - this->min_values.resize(_size251); - uint32_t _i255; - for (_i255 = 0; _i255 < _size251; ++_i255) + uint32_t _size273; + ::apache::thrift::protocol::TType _etype276; + xfer += iprot->readListBegin(_etype276, _size273); + this->min_values.resize(_size273); + uint32_t _i277; + for (_i277 = 0; _i277 < _size273; ++_i277) { - xfer += iprot->readBinary(this->min_values[_i255]); + xfer += iprot->readBinary(this->min_values[_i277]); } xfer += iprot->readListEnd(); } @@ -7324,14 +7670,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->max_values.clear(); - uint32_t _size256; - ::apache::thrift::protocol::TType _etype259; - xfer += iprot->readListBegin(_etype259, _size256); - this->max_values.resize(_size256); - uint32_t _i260; - for (_i260 = 0; _i260 < _size256; ++_i260) + uint32_t _size278; + ::apache::thrift::protocol::TType _etype281; + xfer += iprot->readListBegin(_etype281, _size278); + this->max_values.resize(_size278); + uint32_t _i282; + for (_i282 = 0; _i282 < _size278; ++_i282) { - xfer += iprot->readBinary(this->max_values[_i260]); + xfer += iprot->readBinary(this->max_values[_i282]); } xfer += iprot->readListEnd(); } @@ -7342,9 +7688,9 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { break; case 4: if (ftype == ::apache::thrift::protocol::T_I32) { - int32_t ecast261; - xfer += iprot->readI32(ecast261); - this->boundary_order = static_cast(ecast261); + int32_t ecast283; + xfer += iprot->readI32(ecast283); + this->boundary_order = static_cast(ecast283); isset_boundary_order = true; } else { xfer += iprot->skip(ftype); @@ -7354,14 +7700,14 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { 
this->null_counts.clear(); - uint32_t _size262; - ::apache::thrift::protocol::TType _etype265; - xfer += iprot->readListBegin(_etype265, _size262); - this->null_counts.resize(_size262); - uint32_t _i266; - for (_i266 = 0; _i266 < _size262; ++_i266) + uint32_t _size284; + ::apache::thrift::protocol::TType _etype287; + xfer += iprot->readListBegin(_etype287, _size284); + this->null_counts.resize(_size284); + uint32_t _i288; + for (_i288 = 0; _i288 < _size284; ++_i288) { - xfer += iprot->readI64(this->null_counts[_i266]); + xfer += iprot->readI64(this->null_counts[_i288]); } xfer += iprot->readListEnd(); } @@ -7370,6 +7716,46 @@ uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) { xfer += iprot->skip(ftype); } break; + case 6: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->repetition_level_histograms.clear(); + uint32_t _size289; + ::apache::thrift::protocol::TType _etype292; + xfer += iprot->readListBegin(_etype292, _size289); + this->repetition_level_histograms.resize(_size289); + uint32_t _i293; + for (_i293 = 0; _i293 < _size289; ++_i293) + { + xfer += iprot->readI64(this->repetition_level_histograms[_i293]); + } + xfer += iprot->readListEnd(); + } + this->__isset.repetition_level_histograms = true; + } else { + xfer += iprot->skip(ftype); + } + break; + case 7: + if (ftype == ::apache::thrift::protocol::T_LIST) { + { + this->definition_level_histograms.clear(); + uint32_t _size294; + ::apache::thrift::protocol::TType _etype297; + xfer += iprot->readListBegin(_etype297, _size294); + this->definition_level_histograms.resize(_size294); + uint32_t _i298; + for (_i298 = 0; _i298 < _size294; ++_i298) + { + xfer += iprot->readI64(this->definition_level_histograms[_i298]); + } + xfer += iprot->readListEnd(); + } + this->__isset.definition_level_histograms = true; + } else { + xfer += iprot->skip(ftype); + } + break; default: xfer += iprot->skip(ftype); break; @@ -7398,10 +7784,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast(this->null_pages.size())); - std::vector ::const_iterator _iter267; - for (_iter267 = this->null_pages.begin(); _iter267 != this->null_pages.end(); ++_iter267) + std::vector ::const_iterator _iter299; + for (_iter299 = this->null_pages.begin(); _iter299 != this->null_pages.end(); ++_iter299) { - xfer += oprot->writeBool((*_iter267)); + xfer += oprot->writeBool((*_iter299)); } xfer += oprot->writeListEnd(); } @@ -7410,10 +7796,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast(this->min_values.size())); - std::vector ::const_iterator _iter268; - for (_iter268 = this->min_values.begin(); _iter268 != this->min_values.end(); ++_iter268) + std::vector ::const_iterator _iter300; + for (_iter300 = this->min_values.begin(); _iter300 != this->min_values.end(); ++_iter300) { - xfer += oprot->writeBinary((*_iter268)); + xfer += oprot->writeBinary((*_iter300)); } xfer += oprot->writeListEnd(); } @@ -7422,10 +7808,10 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, 
static_cast(this->max_values.size())); - std::vector ::const_iterator _iter269; - for (_iter269 = this->max_values.begin(); _iter269 != this->max_values.end(); ++_iter269) + std::vector ::const_iterator _iter301; + for (_iter301 = this->max_values.begin(); _iter301 != this->max_values.end(); ++_iter301) { - xfer += oprot->writeBinary((*_iter269)); + xfer += oprot->writeBinary((*_iter301)); } xfer += oprot->writeListEnd(); } @@ -7439,10 +7825,36 @@ uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->null_counts.size())); - std::vector ::const_iterator _iter270; - for (_iter270 = this->null_counts.begin(); _iter270 != this->null_counts.end(); ++_iter270) + std::vector ::const_iterator _iter302; + for (_iter302 = this->null_counts.begin(); _iter302 != this->null_counts.end(); ++_iter302) + { + xfer += oprot->writeI64((*_iter302)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.repetition_level_histograms) { + xfer += oprot->writeFieldBegin("repetition_level_histograms", ::apache::thrift::protocol::T_LIST, 6); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->repetition_level_histograms.size())); + std::vector ::const_iterator _iter303; + for (_iter303 = this->repetition_level_histograms.begin(); _iter303 != this->repetition_level_histograms.end(); ++_iter303) { - xfer += oprot->writeI64((*_iter270)); + xfer += oprot->writeI64((*_iter303)); + } + xfer += oprot->writeListEnd(); + } + xfer += oprot->writeFieldEnd(); + } + if (this->__isset.definition_level_histograms) { + xfer += oprot->writeFieldBegin("definition_level_histograms", ::apache::thrift::protocol::T_LIST, 7); + { + xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast(this->definition_level_histograms.size())); + std::vector ::const_iterator _iter304; + for (_iter304 = this->definition_level_histograms.begin(); _iter304 != this->definition_level_histograms.end(); ++_iter304) + { + xfer += oprot->writeI64((*_iter304)); } xfer += oprot->writeListEnd(); } @@ -7460,41 +7872,51 @@ void swap(ColumnIndex &a, ColumnIndex &b) { swap(a.max_values, b.max_values); swap(a.boundary_order, b.boundary_order); swap(a.null_counts, b.null_counts); + swap(a.repetition_level_histograms, b.repetition_level_histograms); + swap(a.definition_level_histograms, b.definition_level_histograms); swap(a.__isset, b.__isset); } -ColumnIndex::ColumnIndex(const ColumnIndex& other271) { - null_pages = other271.null_pages; - min_values = other271.min_values; - max_values = other271.max_values; - boundary_order = other271.boundary_order; - null_counts = other271.null_counts; - __isset = other271.__isset; -} -ColumnIndex::ColumnIndex(ColumnIndex&& other272) noexcept { - null_pages = std::move(other272.null_pages); - min_values = std::move(other272.min_values); - max_values = std::move(other272.max_values); - boundary_order = other272.boundary_order; - null_counts = std::move(other272.null_counts); - __isset = other272.__isset; -} -ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other273) { - null_pages = other273.null_pages; - min_values = other273.min_values; - max_values = other273.max_values; - boundary_order = other273.boundary_order; - null_counts = other273.null_counts; - __isset = other273.__isset; +ColumnIndex::ColumnIndex(const ColumnIndex& 
other305) { + null_pages = other305.null_pages; + min_values = other305.min_values; + max_values = other305.max_values; + boundary_order = other305.boundary_order; + null_counts = other305.null_counts; + repetition_level_histograms = other305.repetition_level_histograms; + definition_level_histograms = other305.definition_level_histograms; + __isset = other305.__isset; +} +ColumnIndex::ColumnIndex(ColumnIndex&& other306) noexcept { + null_pages = std::move(other306.null_pages); + min_values = std::move(other306.min_values); + max_values = std::move(other306.max_values); + boundary_order = other306.boundary_order; + null_counts = std::move(other306.null_counts); + repetition_level_histograms = std::move(other306.repetition_level_histograms); + definition_level_histograms = std::move(other306.definition_level_histograms); + __isset = other306.__isset; +} +ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other307) { + null_pages = other307.null_pages; + min_values = other307.min_values; + max_values = other307.max_values; + boundary_order = other307.boundary_order; + null_counts = other307.null_counts; + repetition_level_histograms = other307.repetition_level_histograms; + definition_level_histograms = other307.definition_level_histograms; + __isset = other307.__isset; return *this; } -ColumnIndex& ColumnIndex::operator=(ColumnIndex&& other274) noexcept { - null_pages = std::move(other274.null_pages); - min_values = std::move(other274.min_values); - max_values = std::move(other274.max_values); - boundary_order = other274.boundary_order; - null_counts = std::move(other274.null_counts); - __isset = other274.__isset; +ColumnIndex& ColumnIndex::operator=(ColumnIndex&& other308) noexcept { + null_pages = std::move(other308.null_pages); + min_values = std::move(other308.min_values); + max_values = std::move(other308.max_values); + boundary_order = other308.boundary_order; + null_counts = std::move(other308.null_counts); + repetition_level_histograms = std::move(other308.repetition_level_histograms); + definition_level_histograms = std::move(other308.definition_level_histograms); + __isset = other308.__isset; return *this; } void ColumnIndex::printTo(std::ostream& out) const { @@ -7505,6 +7927,8 @@ void ColumnIndex::printTo(std::ostream& out) const { out << ", " << "max_values=" << to_string(max_values); out << ", " << "boundary_order=" << to_string(boundary_order); out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "")); + out << ", " << "repetition_level_histograms="; (__isset.repetition_level_histograms ? (out << to_string(repetition_level_histograms)) : (out << "")); + out << ", " << "definition_level_histograms="; (__isset.definition_level_histograms ? 
(out << to_string(definition_level_histograms)) : (out << "")); out << ")"; } @@ -7624,30 +8048,30 @@ void swap(AesGcmV1 &a, AesGcmV1 &b) { swap(a.__isset, b.__isset); } -AesGcmV1::AesGcmV1(const AesGcmV1& other275) { - aad_prefix = other275.aad_prefix; - aad_file_unique = other275.aad_file_unique; - supply_aad_prefix = other275.supply_aad_prefix; - __isset = other275.__isset; +AesGcmV1::AesGcmV1(const AesGcmV1& other309) { + aad_prefix = other309.aad_prefix; + aad_file_unique = other309.aad_file_unique; + supply_aad_prefix = other309.supply_aad_prefix; + __isset = other309.__isset; } -AesGcmV1::AesGcmV1(AesGcmV1&& other276) noexcept { - aad_prefix = std::move(other276.aad_prefix); - aad_file_unique = std::move(other276.aad_file_unique); - supply_aad_prefix = other276.supply_aad_prefix; - __isset = other276.__isset; +AesGcmV1::AesGcmV1(AesGcmV1&& other310) noexcept { + aad_prefix = std::move(other310.aad_prefix); + aad_file_unique = std::move(other310.aad_file_unique); + supply_aad_prefix = other310.supply_aad_prefix; + __isset = other310.__isset; } -AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other277) { - aad_prefix = other277.aad_prefix; - aad_file_unique = other277.aad_file_unique; - supply_aad_prefix = other277.supply_aad_prefix; - __isset = other277.__isset; +AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other311) { + aad_prefix = other311.aad_prefix; + aad_file_unique = other311.aad_file_unique; + supply_aad_prefix = other311.supply_aad_prefix; + __isset = other311.__isset; return *this; } -AesGcmV1& AesGcmV1::operator=(AesGcmV1&& other278) noexcept { - aad_prefix = std::move(other278.aad_prefix); - aad_file_unique = std::move(other278.aad_file_unique); - supply_aad_prefix = other278.supply_aad_prefix; - __isset = other278.__isset; +AesGcmV1& AesGcmV1::operator=(AesGcmV1&& other312) noexcept { + aad_prefix = std::move(other312.aad_prefix); + aad_file_unique = std::move(other312.aad_file_unique); + supply_aad_prefix = other312.supply_aad_prefix; + __isset = other312.__isset; return *this; } void AesGcmV1::printTo(std::ostream& out) const { @@ -7775,30 +8199,30 @@ void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) { swap(a.__isset, b.__isset); } -AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other279) { - aad_prefix = other279.aad_prefix; - aad_file_unique = other279.aad_file_unique; - supply_aad_prefix = other279.supply_aad_prefix; - __isset = other279.__isset; +AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other313) { + aad_prefix = other313.aad_prefix; + aad_file_unique = other313.aad_file_unique; + supply_aad_prefix = other313.supply_aad_prefix; + __isset = other313.__isset; } -AesGcmCtrV1::AesGcmCtrV1(AesGcmCtrV1&& other280) noexcept { - aad_prefix = std::move(other280.aad_prefix); - aad_file_unique = std::move(other280.aad_file_unique); - supply_aad_prefix = other280.supply_aad_prefix; - __isset = other280.__isset; +AesGcmCtrV1::AesGcmCtrV1(AesGcmCtrV1&& other314) noexcept { + aad_prefix = std::move(other314.aad_prefix); + aad_file_unique = std::move(other314.aad_file_unique); + supply_aad_prefix = other314.supply_aad_prefix; + __isset = other314.__isset; } -AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other281) { - aad_prefix = other281.aad_prefix; - aad_file_unique = other281.aad_file_unique; - supply_aad_prefix = other281.supply_aad_prefix; - __isset = other281.__isset; +AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other315) { + aad_prefix = other315.aad_prefix; + aad_file_unique = other315.aad_file_unique; + supply_aad_prefix = other315.supply_aad_prefix; + __isset 
= other315.__isset; return *this; } -AesGcmCtrV1& AesGcmCtrV1::operator=(AesGcmCtrV1&& other282) noexcept { - aad_prefix = std::move(other282.aad_prefix); - aad_file_unique = std::move(other282.aad_file_unique); - supply_aad_prefix = other282.supply_aad_prefix; - __isset = other282.__isset; +AesGcmCtrV1& AesGcmCtrV1::operator=(AesGcmCtrV1&& other316) noexcept { + aad_prefix = std::move(other316.aad_prefix); + aad_file_unique = std::move(other316.aad_file_unique); + supply_aad_prefix = other316.supply_aad_prefix; + __isset = other316.__isset; return *this; } void AesGcmCtrV1::printTo(std::ostream& out) const { @@ -7907,26 +8331,26 @@ void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) { swap(a.__isset, b.__isset); } -EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other283) { - AES_GCM_V1 = other283.AES_GCM_V1; - AES_GCM_CTR_V1 = other283.AES_GCM_CTR_V1; - __isset = other283.__isset; +EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other317) { + AES_GCM_V1 = other317.AES_GCM_V1; + AES_GCM_CTR_V1 = other317.AES_GCM_CTR_V1; + __isset = other317.__isset; } -EncryptionAlgorithm::EncryptionAlgorithm(EncryptionAlgorithm&& other284) noexcept { - AES_GCM_V1 = std::move(other284.AES_GCM_V1); - AES_GCM_CTR_V1 = std::move(other284.AES_GCM_CTR_V1); - __isset = other284.__isset; +EncryptionAlgorithm::EncryptionAlgorithm(EncryptionAlgorithm&& other318) noexcept { + AES_GCM_V1 = std::move(other318.AES_GCM_V1); + AES_GCM_CTR_V1 = std::move(other318.AES_GCM_CTR_V1); + __isset = other318.__isset; } -EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other285) { - AES_GCM_V1 = other285.AES_GCM_V1; - AES_GCM_CTR_V1 = other285.AES_GCM_CTR_V1; - __isset = other285.__isset; +EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other319) { + AES_GCM_V1 = other319.AES_GCM_V1; + AES_GCM_CTR_V1 = other319.AES_GCM_CTR_V1; + __isset = other319.__isset; return *this; } -EncryptionAlgorithm& EncryptionAlgorithm::operator=(EncryptionAlgorithm&& other286) noexcept { - AES_GCM_V1 = std::move(other286.AES_GCM_V1); - AES_GCM_CTR_V1 = std::move(other286.AES_GCM_CTR_V1); - __isset = other286.__isset; +EncryptionAlgorithm& EncryptionAlgorithm::operator=(EncryptionAlgorithm&& other320) noexcept { + AES_GCM_V1 = std::move(other320.AES_GCM_V1); + AES_GCM_CTR_V1 = std::move(other320.AES_GCM_CTR_V1); + __isset = other320.__isset; return *this; } void EncryptionAlgorithm::printTo(std::ostream& out) const { @@ -8026,14 +8450,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->schema.clear(); - uint32_t _size287; - ::apache::thrift::protocol::TType _etype290; - xfer += iprot->readListBegin(_etype290, _size287); - this->schema.resize(_size287); - uint32_t _i291; - for (_i291 = 0; _i291 < _size287; ++_i291) + uint32_t _size321; + ::apache::thrift::protocol::TType _etype324; + xfer += iprot->readListBegin(_etype324, _size321); + this->schema.resize(_size321); + uint32_t _i325; + for (_i325 = 0; _i325 < _size321; ++_i325) { - xfer += this->schema[_i291].read(iprot); + xfer += this->schema[_i325].read(iprot); } xfer += iprot->readListEnd(); } @@ -8054,14 +8478,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->row_groups.clear(); - uint32_t _size292; - ::apache::thrift::protocol::TType _etype295; - xfer += iprot->readListBegin(_etype295, _size292); - 
this->row_groups.resize(_size292); - uint32_t _i296; - for (_i296 = 0; _i296 < _size292; ++_i296) + uint32_t _size326; + ::apache::thrift::protocol::TType _etype329; + xfer += iprot->readListBegin(_etype329, _size326); + this->row_groups.resize(_size326); + uint32_t _i330; + for (_i330 = 0; _i330 < _size326; ++_i330) { - xfer += this->row_groups[_i296].read(iprot); + xfer += this->row_groups[_i330].read(iprot); } xfer += iprot->readListEnd(); } @@ -8074,14 +8498,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->key_value_metadata.clear(); - uint32_t _size297; - ::apache::thrift::protocol::TType _etype300; - xfer += iprot->readListBegin(_etype300, _size297); - this->key_value_metadata.resize(_size297); - uint32_t _i301; - for (_i301 = 0; _i301 < _size297; ++_i301) + uint32_t _size331; + ::apache::thrift::protocol::TType _etype334; + xfer += iprot->readListBegin(_etype334, _size331); + this->key_value_metadata.resize(_size331); + uint32_t _i335; + for (_i335 = 0; _i335 < _size331; ++_i335) { - xfer += this->key_value_metadata[_i301].read(iprot); + xfer += this->key_value_metadata[_i335].read(iprot); } xfer += iprot->readListEnd(); } @@ -8102,14 +8526,14 @@ uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) { if (ftype == ::apache::thrift::protocol::T_LIST) { { this->column_orders.clear(); - uint32_t _size302; - ::apache::thrift::protocol::TType _etype305; - xfer += iprot->readListBegin(_etype305, _size302); - this->column_orders.resize(_size302); - uint32_t _i306; - for (_i306 = 0; _i306 < _size302; ++_i306) + uint32_t _size336; + ::apache::thrift::protocol::TType _etype339; + xfer += iprot->readListBegin(_etype339, _size336); + this->column_orders.resize(_size336); + uint32_t _i340; + for (_i340 = 0; _i340 < _size336; ++_i340) { - xfer += this->column_orders[_i306].read(iprot); + xfer += this->column_orders[_i340].read(iprot); } xfer += iprot->readListEnd(); } @@ -8166,10 +8590,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->schema.size())); - std::vector ::const_iterator _iter307; - for (_iter307 = this->schema.begin(); _iter307 != this->schema.end(); ++_iter307) + std::vector ::const_iterator _iter341; + for (_iter341 = this->schema.begin(); _iter341 != this->schema.end(); ++_iter341) { - xfer += (*_iter307).write(oprot); + xfer += (*_iter341).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8182,10 +8606,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->row_groups.size())); - std::vector ::const_iterator _iter308; - for (_iter308 = this->row_groups.begin(); _iter308 != this->row_groups.end(); ++_iter308) + std::vector ::const_iterator _iter342; + for (_iter342 = this->row_groups.begin(); _iter342 != this->row_groups.end(); ++_iter342) { - xfer += (*_iter308).write(oprot); + xfer += (*_iter342).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8195,10 +8619,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5); { xfer += 
oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->key_value_metadata.size())); - std::vector ::const_iterator _iter309; - for (_iter309 = this->key_value_metadata.begin(); _iter309 != this->key_value_metadata.end(); ++_iter309) + std::vector ::const_iterator _iter343; + for (_iter343 = this->key_value_metadata.begin(); _iter343 != this->key_value_metadata.end(); ++_iter343) { - xfer += (*_iter309).write(oprot); + xfer += (*_iter343).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8213,10 +8637,10 @@ uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7); { xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast(this->column_orders.size())); - std::vector ::const_iterator _iter310; - for (_iter310 = this->column_orders.begin(); _iter310 != this->column_orders.end(); ++_iter310) + std::vector ::const_iterator _iter344; + for (_iter344 = this->column_orders.begin(); _iter344 != this->column_orders.end(); ++_iter344) { - xfer += (*_iter310).write(oprot); + xfer += (*_iter344).write(oprot); } xfer += oprot->writeListEnd(); } @@ -8251,54 +8675,54 @@ void swap(FileMetaData &a, FileMetaData &b) { swap(a.__isset, b.__isset); } -FileMetaData::FileMetaData(const FileMetaData& other311) { - version = other311.version; - schema = other311.schema; - num_rows = other311.num_rows; - row_groups = other311.row_groups; - key_value_metadata = other311.key_value_metadata; - created_by = other311.created_by; - column_orders = other311.column_orders; - encryption_algorithm = other311.encryption_algorithm; - footer_signing_key_metadata = other311.footer_signing_key_metadata; - __isset = other311.__isset; -} -FileMetaData::FileMetaData(FileMetaData&& other312) noexcept { - version = other312.version; - schema = std::move(other312.schema); - num_rows = other312.num_rows; - row_groups = std::move(other312.row_groups); - key_value_metadata = std::move(other312.key_value_metadata); - created_by = std::move(other312.created_by); - column_orders = std::move(other312.column_orders); - encryption_algorithm = std::move(other312.encryption_algorithm); - footer_signing_key_metadata = std::move(other312.footer_signing_key_metadata); - __isset = other312.__isset; -} -FileMetaData& FileMetaData::operator=(const FileMetaData& other313) { - version = other313.version; - schema = other313.schema; - num_rows = other313.num_rows; - row_groups = other313.row_groups; - key_value_metadata = other313.key_value_metadata; - created_by = other313.created_by; - column_orders = other313.column_orders; - encryption_algorithm = other313.encryption_algorithm; - footer_signing_key_metadata = other313.footer_signing_key_metadata; - __isset = other313.__isset; +FileMetaData::FileMetaData(const FileMetaData& other345) { + version = other345.version; + schema = other345.schema; + num_rows = other345.num_rows; + row_groups = other345.row_groups; + key_value_metadata = other345.key_value_metadata; + created_by = other345.created_by; + column_orders = other345.column_orders; + encryption_algorithm = other345.encryption_algorithm; + footer_signing_key_metadata = other345.footer_signing_key_metadata; + __isset = other345.__isset; +} +FileMetaData::FileMetaData(FileMetaData&& other346) noexcept { + version = other346.version; + schema = std::move(other346.schema); + num_rows = other346.num_rows; + row_groups = std::move(other346.row_groups); + key_value_metadata = 
std::move(other346.key_value_metadata); + created_by = std::move(other346.created_by); + column_orders = std::move(other346.column_orders); + encryption_algorithm = std::move(other346.encryption_algorithm); + footer_signing_key_metadata = std::move(other346.footer_signing_key_metadata); + __isset = other346.__isset; +} +FileMetaData& FileMetaData::operator=(const FileMetaData& other347) { + version = other347.version; + schema = other347.schema; + num_rows = other347.num_rows; + row_groups = other347.row_groups; + key_value_metadata = other347.key_value_metadata; + created_by = other347.created_by; + column_orders = other347.column_orders; + encryption_algorithm = other347.encryption_algorithm; + footer_signing_key_metadata = other347.footer_signing_key_metadata; + __isset = other347.__isset; return *this; } -FileMetaData& FileMetaData::operator=(FileMetaData&& other314) noexcept { - version = other314.version; - schema = std::move(other314.schema); - num_rows = other314.num_rows; - row_groups = std::move(other314.row_groups); - key_value_metadata = std::move(other314.key_value_metadata); - created_by = std::move(other314.created_by); - column_orders = std::move(other314.column_orders); - encryption_algorithm = std::move(other314.encryption_algorithm); - footer_signing_key_metadata = std::move(other314.footer_signing_key_metadata); - __isset = other314.__isset; +FileMetaData& FileMetaData::operator=(FileMetaData&& other348) noexcept { + version = other348.version; + schema = std::move(other348.schema); + num_rows = other348.num_rows; + row_groups = std::move(other348.row_groups); + key_value_metadata = std::move(other348.key_value_metadata); + created_by = std::move(other348.created_by); + column_orders = std::move(other348.column_orders); + encryption_algorithm = std::move(other348.encryption_algorithm); + footer_signing_key_metadata = std::move(other348.footer_signing_key_metadata); + __isset = other348.__isset; return *this; } void FileMetaData::printTo(std::ostream& out) const { @@ -8414,26 +8838,26 @@ void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) { swap(a.__isset, b.__isset); } -FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other315) { - encryption_algorithm = other315.encryption_algorithm; - key_metadata = other315.key_metadata; - __isset = other315.__isset; +FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other349) { + encryption_algorithm = other349.encryption_algorithm; + key_metadata = other349.key_metadata; + __isset = other349.__isset; } -FileCryptoMetaData::FileCryptoMetaData(FileCryptoMetaData&& other316) noexcept { - encryption_algorithm = std::move(other316.encryption_algorithm); - key_metadata = std::move(other316.key_metadata); - __isset = other316.__isset; +FileCryptoMetaData::FileCryptoMetaData(FileCryptoMetaData&& other350) noexcept { + encryption_algorithm = std::move(other350.encryption_algorithm); + key_metadata = std::move(other350.key_metadata); + __isset = other350.__isset; } -FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other317) { - encryption_algorithm = other317.encryption_algorithm; - key_metadata = other317.key_metadata; - __isset = other317.__isset; +FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other351) { + encryption_algorithm = other351.encryption_algorithm; + key_metadata = other351.key_metadata; + __isset = other351.__isset; return *this; } -FileCryptoMetaData& FileCryptoMetaData::operator=(FileCryptoMetaData&& other318) noexcept { - 
encryption_algorithm = std::move(other318.encryption_algorithm); - key_metadata = std::move(other318.key_metadata); - __isset = other318.__isset; +FileCryptoMetaData& FileCryptoMetaData::operator=(FileCryptoMetaData&& other352) noexcept { + encryption_algorithm = std::move(other352.encryption_algorithm); + key_metadata = std::move(other352.key_metadata); + __isset = other352.__isset; return *this; } void FileCryptoMetaData::printTo(std::ostream& out) const { diff --git a/cpp/src/generated/parquet_types.h b/cpp/src/generated/parquet_types.h index 199b4ae747667..9dc6794c4030b 100644 --- a/cpp/src/generated/parquet_types.h +++ b/cpp/src/generated/parquet_types.h @@ -1,5 +1,5 @@ /** - * Autogenerated by Thrift Compiler (0.18.1) + * Autogenerated by Thrift Compiler (0.19.0) * * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING * @generated @@ -345,6 +345,8 @@ std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val); std::string to_string(const BoundaryOrder::type& val); +class SizeStatistics; + class Statistics; class StringType; @@ -449,14 +451,121 @@ class FileMetaData; class FileCryptoMetaData; +typedef struct _SizeStatistics__isset { + _SizeStatistics__isset() : unencoded_byte_array_data_bytes(false), repetition_level_histogram(false), definition_level_histogram(false) {} + bool unencoded_byte_array_data_bytes :1; + bool repetition_level_histogram :1; + bool definition_level_histogram :1; +} _SizeStatistics__isset; + +/** + * A structure for capturing metadata for estimating the unencoded, + * uncompressed size of data written. This is useful for readers to estimate + * how much memory is needed to reconstruct data in their memory model and for + * fine grained filter pushdown on nested structures (the histograms contained + * in this structure can help determine the number of nulls at a particular + * nesting level and maximum length of lists). + */ +class SizeStatistics : public virtual ::apache::thrift::TBase { + public: + + SizeStatistics(const SizeStatistics&); + SizeStatistics(SizeStatistics&&) noexcept; + SizeStatistics& operator=(const SizeStatistics&); + SizeStatistics& operator=(SizeStatistics&&) noexcept; + SizeStatistics() noexcept + : unencoded_byte_array_data_bytes(0) { + } + + virtual ~SizeStatistics() noexcept; + /** + * The number of physical bytes stored for BYTE_ARRAY data values assuming + * no encoding. This is exclusive of the bytes needed to store the length of + * each byte array. In other words, this field is equivalent to the `(size + * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + * written)`. To determine unencoded sizes of other types readers can use + * schema information multiplied by the number of non-null and null values. + * The number of null/non-null values can be inferred from the histograms + * below. + * + * For example, if a column chunk is dictionary-encoded with dictionary + * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + * then this value for that data page should be 7 (1 + 1 + 2 + 3). + * + * This field should only be set for types that use BYTE_ARRAY as their + * physical type. + */ + int64_t unencoded_byte_array_data_bytes; + /** + * When present, there is expected to be one element corresponding to each + * repetition (i.e. size=max repetition_level+1) where each element + * represents the number of times the repetition level was observed in the + * data. + * + * This field may be omitted if max_repetition_level is 0 without loss + * of information. 
+ * + */ + std::vector repetition_level_histogram; + /** + * Same as repetition_level_histogram except for definition levels. + * + * This field may be omitted if max_definition_level is 0 or 1 without + * loss of information. + * + */ + std::vector definition_level_histogram; + + _SizeStatistics__isset __isset; + + void __set_unencoded_byte_array_data_bytes(const int64_t val); + + void __set_repetition_level_histogram(const std::vector & val); + + void __set_definition_level_histogram(const std::vector & val); + + bool operator == (const SizeStatistics & rhs) const + { + if (__isset.unencoded_byte_array_data_bytes != rhs.__isset.unencoded_byte_array_data_bytes) + return false; + else if (__isset.unencoded_byte_array_data_bytes && !(unencoded_byte_array_data_bytes == rhs.unencoded_byte_array_data_bytes)) + return false; + if (__isset.repetition_level_histogram != rhs.__isset.repetition_level_histogram) + return false; + else if (__isset.repetition_level_histogram && !(repetition_level_histogram == rhs.repetition_level_histogram)) + return false; + if (__isset.definition_level_histogram != rhs.__isset.definition_level_histogram) + return false; + else if (__isset.definition_level_histogram && !(definition_level_histogram == rhs.definition_level_histogram)) + return false; + return true; + } + bool operator != (const SizeStatistics &rhs) const { + return !(*this == rhs); + } + + bool operator < (const SizeStatistics & ) const; + + uint32_t read(::apache::thrift::protocol::TProtocol* iprot) override; + uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const override; + + virtual void printTo(std::ostream& out) const; +}; + +void swap(SizeStatistics &a, SizeStatistics &b); + +std::ostream& operator<<(std::ostream& out, const SizeStatistics& obj); + typedef struct _Statistics__isset { - _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {} + _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false), is_max_value_exact(false), is_min_value_exact(false) {} bool max :1; bool min :1; bool null_count :1; bool distinct_count :1; bool max_value :1; bool min_value :1; + bool is_max_value_exact :1; + bool is_min_value_exact :1; } _Statistics__isset; /** @@ -476,7 +585,9 @@ class Statistics : public virtual ::apache::thrift::TBase { null_count(0), distinct_count(0), max_value(), - min_value() { + min_value(), + is_max_value_exact(0), + is_min_value_exact(0) { } virtual ~Statistics() noexcept; @@ -504,13 +615,27 @@ class Statistics : public virtual ::apache::thrift::TBase { */ int64_t distinct_count; /** - * Min and max values for the column, determined by its ColumnOrder. + * Lower and upper bound values for the column, determined by its ColumnOrder. + * + * These may be the actual minimum and maximum values found on a page or column + * chunk, but can also be (more compact) values that do not exist on a page or + * column chunk. For example, instead of storing "Blart Versenwald III", a writer + * may set min_value="B", max_value="C". Such more compact values must still be + * valid values within the column's logical type. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. 
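As a standalone sketch (illustrative only, with assumed helper names rather than actual writer code), the two kinds of SizeStatistics fields documented above can be derived like this, reusing the dictionary example from the unencoded_byte_array_data_bytes comment:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Raw BYTE_ARRAY bytes, excluding the 4-byte length prefixes used by PLAIN encoding.
int64_t UnencodedByteArrayDataBytes(const std::vector<std::string>& dictionary,
                                    const std::vector<int32_t>& indices) {
  int64_t total = 0;
  for (int32_t idx : indices) total += static_cast<int64_t>(dictionary[idx].size());
  return total;
}

// One bucket per level value in [0, max_level], counting how often it was observed.
std::vector<int64_t> LevelHistogram(const std::vector<int16_t>& levels, int16_t max_level) {
  std::vector<int64_t> histogram(static_cast<size_t>(max_level) + 1, 0);
  for (int16_t level : levels) histogram[level]++;
  return histogram;
}

int main() {
  // Dictionary ["a", "bc", "cde"] with page indices [0, 0, 1, 2] -> 1 + 1 + 2 + 3 = 7,
  // matching the example in the unencoded_byte_array_data_bytes documentation.
  std::cout << UnencodedByteArrayDataBytes({"a", "bc", "cde"}, {0, 0, 1, 2}) << "\n";
  // Definition levels for four values with max_definition_level == 2 -> prints "1 1 2".
  for (int64_t count : LevelHistogram({0, 2, 2, 1}, 2)) std::cout << count << " ";
  std::cout << "\n";
  return 0;
}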
*/ std::string max_value; std::string min_value; + /** + * If true, max_value is the actual maximum value for a column + */ + bool is_max_value_exact; + /** + * If true, min_value is the actual minimum value for a column + */ + bool is_min_value_exact; _Statistics__isset __isset; @@ -526,6 +651,10 @@ class Statistics : public virtual ::apache::thrift::TBase { void __set_min_value(const std::string& val); + void __set_is_max_value_exact(const bool val); + + void __set_is_min_value_exact(const bool val); + bool operator == (const Statistics & rhs) const { if (__isset.max != rhs.__isset.max) @@ -552,6 +681,14 @@ class Statistics : public virtual ::apache::thrift::TBase { return false; else if (__isset.min_value && !(min_value == rhs.min_value)) return false; + if (__isset.is_max_value_exact != rhs.__isset.is_max_value_exact) + return false; + else if (__isset.is_max_value_exact && !(is_max_value_exact == rhs.is_max_value_exact)) + return false; + if (__isset.is_min_value_exact != rhs.__isset.is_min_value_exact) + return false; + else if (__isset.is_min_value_exact && !(is_min_value_exact == rhs.is_min_value_exact)) + return false; return true; } bool operator != (const Statistics &rhs) const { @@ -848,6 +985,9 @@ std::ostream& operator<<(std::ostream& out, const NullType& obj); /** * Decimal logical type annotation * + * Scale must be zero or a positive integer less than or equal to the precision. + * Precision must be a non-zero positive integer. + * * To maintain forward-compatibility in v1, implementations using this logical * type must also set scale and precision on the annotated SchemaElement. * @@ -1670,7 +1810,7 @@ class DataPageHeader : public virtual ::apache::thrift::TBase { */ Encoding::type repetition_level_encoding; /** - * Optional statistics for the data in this page* + * Optional statistics for the data in this page * */ Statistics statistics; @@ -1877,15 +2017,15 @@ class DataPageHeaderV2 : public virtual ::apache::thrift::TBase { */ Encoding::type encoding; /** - * length of the definition levels + * Length of the definition levels */ int32_t definition_levels_byte_length; /** - * length of the repetition levels + * Length of the repetition levels */ int32_t repetition_levels_byte_length; /** - * whether the values are compressed. + * Whether the values are compressed. * Which means the section of the page between * definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) * is compressed with the compression_codec. 
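To make the interplay between the compact bounds and the new is_max_value_exact / is_min_value_exact flags concrete, here is a self-contained sketch; BoundWithExactness and MakeUpperBound are illustrative stand-ins, not the writer's actual truncation logic:

#include <iostream>
#include <string>

struct BoundWithExactness {  // simplified stand-in for the Thrift Statistics fields
  std::string max_value;
  bool is_max_value_exact;
};

BoundWithExactness MakeUpperBound(const std::string& actual_max, size_t max_len) {
  if (actual_max.size() <= max_len) {
    return {actual_max, /*is_max_value_exact=*/true};
  }
  // Truncate and bump the last kept character so the result still upper-bounds the
  // actual maximum (assumes the bumped character does not overflow).
  std::string bound = actual_max.substr(0, max_len);
  bound.back() = static_cast<char>(bound.back() + 1);
  return {bound, /*is_max_value_exact=*/false};
}

int main() {
  // The example from the comment above: instead of "Blart Versenwald III",
  // a writer may store the more compact upper bound "C".
  BoundWithExactness b = MakeUpperBound("Blart Versenwald III", 1);
  std::cout << b.max_value << " exact=" << b.is_max_value_exact << "\n";  // C exact=0
  return 0;
}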
@@ -1893,7 +2033,7 @@ class DataPageHeaderV2 : public virtual ::apache::thrift::TBase { */ bool is_compressed; /** - * optional statistics for the data in this page * + * Optional statistics for the data in this page * */ Statistics statistics; @@ -2603,13 +2743,15 @@ void swap(PageEncodingStats &a, PageEncodingStats &b); std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj); typedef struct _ColumnMetaData__isset { - _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {} + _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false), bloom_filter_length(false), size_statistics(false) {} bool key_value_metadata :1; bool index_page_offset :1; bool dictionary_page_offset :1; bool statistics :1; bool encoding_stats :1; bool bloom_filter_offset :1; + bool bloom_filter_length :1; + bool size_statistics :1; } _ColumnMetaData__isset; /** @@ -2631,7 +2773,8 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), - bloom_filter_offset(0) { + bloom_filter_offset(0), + bloom_filter_length(0) { } virtual ~ColumnMetaData() noexcept; @@ -2699,6 +2842,21 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { * Byte offset from beginning of file to Bloom filter data. * */ int64_t bloom_filter_offset; + /** + * Size of Bloom filter data including the serialized header, in bytes. + * Added in 2.10 so readers may not read this field from old files and + * it can be obtained after the BloomFilterHeader has been deserialized. + * Writers should write this field so readers can read the bloom filter + * in a single I/O. + */ + int32_t bloom_filter_length; + /** + * Optional statistics to help estimate total memory when converted to in-memory + * representations. The histograms contained in these statistics can + * also be useful in some cases for more fine-grained nullability/list length + * filter pushdown. 
+ */ + SizeStatistics size_statistics; _ColumnMetaData__isset __isset; @@ -2730,6 +2888,10 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { void __set_bloom_filter_offset(const int64_t val); + void __set_bloom_filter_length(const int32_t val); + + void __set_size_statistics(const SizeStatistics& val); + bool operator == (const ColumnMetaData & rhs) const { if (!(type == rhs.type)) @@ -2772,6 +2934,14 @@ class ColumnMetaData : public virtual ::apache::thrift::TBase { return false; else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset)) return false; + if (__isset.bloom_filter_length != rhs.__isset.bloom_filter_length) + return false; + else if (__isset.bloom_filter_length && !(bloom_filter_length == rhs.bloom_filter_length)) + return false; + if (__isset.size_statistics != rhs.__isset.size_statistics) + return false; + else if (__isset.size_statistics && !(size_statistics == rhs.size_statistics)) + return false; return true; } bool operator != (const ColumnMetaData &rhs) const { @@ -3403,6 +3573,10 @@ void swap(PageLocation &a, PageLocation &b); std::ostream& operator<<(std::ostream& out, const PageLocation& obj); +typedef struct _OffsetIndex__isset { + _OffsetIndex__isset() : unencoded_byte_array_data_bytes(false) {} + bool unencoded_byte_array_data_bytes :1; +} _OffsetIndex__isset; class OffsetIndex : public virtual ::apache::thrift::TBase { public: @@ -3420,13 +3594,28 @@ class OffsetIndex : public virtual ::apache::thrift::TBase { * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. */ std::vector page_locations; + /** + * Unencoded/uncompressed size for BYTE_ARRAY types. + * + * See documention for unencoded_byte_array_data_bytes in SizeStatistics for + * more details on this field. + */ + std::vector unencoded_byte_array_data_bytes; + + _OffsetIndex__isset __isset; void __set_page_locations(const std::vector & val); + void __set_unencoded_byte_array_data_bytes(const std::vector & val); + bool operator == (const OffsetIndex & rhs) const { if (!(page_locations == rhs.page_locations)) return false; + if (__isset.unencoded_byte_array_data_bytes != rhs.__isset.unencoded_byte_array_data_bytes) + return false; + else if (__isset.unencoded_byte_array_data_bytes && !(unencoded_byte_array_data_bytes == rhs.unencoded_byte_array_data_bytes)) + return false; return true; } bool operator != (const OffsetIndex &rhs) const { @@ -3446,8 +3635,10 @@ void swap(OffsetIndex &a, OffsetIndex &b); std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj); typedef struct _ColumnIndex__isset { - _ColumnIndex__isset() : null_counts(false) {} + _ColumnIndex__isset() : null_counts(false), repetition_level_histograms(false), definition_level_histograms(false) {} bool null_counts :1; + bool repetition_level_histograms :1; + bool definition_level_histograms :1; } _ColumnIndex__isset; /** @@ -3499,6 +3690,25 @@ class ColumnIndex : public virtual ::apache::thrift::TBase { * A list containing the number of null values for each page * */ std::vector null_counts; + /** + * Contains repetition level histograms for each page + * concatenated together. The repetition_level_histogram field on + * SizeStatistics contains more details. + * + * When present the length should always be (number of pages * + * (max_repetition_level + 1)) elements. + * + * Element 0 is the first element of the histogram for the first page. + * Element (max_repetition_level + 1) is the first element of the histogram + * for the second page. 
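The flattened layout described above (number of pages * (max_repetition_level + 1) elements, page-major) implies simple index arithmetic; a small sketch with an assumed helper name:

#include <cstdint>
#include <iostream>
#include <vector>

// The count for `level` on page `page` lives at page * (max_level + 1) + level.
int64_t HistogramEntry(const std::vector<int64_t>& concatenated_histograms,
                       int64_t page, int16_t max_level, int16_t level) {
  return concatenated_histograms[page * (max_level + 1) + level];
}

int main() {
  // Two pages with max_repetition_level == 1, so 2 * (1 + 1) == 4 elements:
  // page 0 -> {3, 5}, page 1 -> {2, 7}.
  std::vector<int64_t> rep_histograms = {3, 5, 2, 7};
  std::cout << HistogramEntry(rep_histograms, /*page=*/1, /*max_level=*/1, /*level=*/0)
            << "\n";  // prints 2
  return 0;
}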
+ * + */ + std::vector repetition_level_histograms; + /** + * Same as repetition_level_histograms except for definitions levels. + * + */ + std::vector definition_level_histograms; _ColumnIndex__isset __isset; @@ -3512,6 +3722,10 @@ class ColumnIndex : public virtual ::apache::thrift::TBase { void __set_null_counts(const std::vector & val); + void __set_repetition_level_histograms(const std::vector & val); + + void __set_definition_level_histograms(const std::vector & val); + bool operator == (const ColumnIndex & rhs) const { if (!(null_pages == rhs.null_pages)) @@ -3526,6 +3740,14 @@ class ColumnIndex : public virtual ::apache::thrift::TBase { return false; else if (__isset.null_counts && !(null_counts == rhs.null_counts)) return false; + if (__isset.repetition_level_histograms != rhs.__isset.repetition_level_histograms) + return false; + else if (__isset.repetition_level_histograms && !(repetition_level_histograms == rhs.repetition_level_histograms)) + return false; + if (__isset.definition_level_histograms != rhs.__isset.definition_level_histograms) + return false; + else if (__isset.definition_level_histograms && !(definition_level_histograms == rhs.definition_level_histograms)) + return false; return true; } bool operator != (const ColumnIndex &rhs) const { diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 9c6f7a044b589..a2f3498190f93 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -2410,7 +2410,7 @@ TEST(TestArrowReadWrite, WaitCoalescedReads) { ASSERT_EQ(actual_batch->num_rows(), num_rows); } -// Use coalesced reads and non-coaleasced reads for different column chunks. +// Use coalesced reads and non-coalesced reads for different column chunks. TEST(TestArrowReadWrite, CoalescedReadsAndNonCoalescedReads) { constexpr int num_columns = 5; constexpr int num_rows = 128; @@ -2918,7 +2918,7 @@ TEST(ArrowReadWrite, DecimalStats) { auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array}); std::shared_ptr buffer; - ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, /*row_grop_size=*/100, + ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, /*row_group_size=*/100, default_arrow_writer_properties(), &buffer)); std::unique_ptr reader; diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 919c97f4323b6..b1e4742aaaba4 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -66,7 +66,7 @@ // 3. In order to keep repetition/definition level populated the algorithm is lazy // in assigning repetition levels. The algorithm tracks whether it is currently // in the middle of a list by comparing the lengths of repetition/definition levels. -// If it is currently in the middle of a list the the number of repetition levels +// If it is currently in the middle of a list the number of repetition levels // populated will be greater than definition levels (the start of a List requires // adding the first element). 
If there are equal numbers of definition and repetition // levels populated this indicates a list is waiting to be started and the next list @@ -141,7 +141,7 @@ int64_t LazyNullCount(const Array& array) { return array.data()->null_count.load bool LazyNoNulls(const Array& array) { int64_t null_count = LazyNullCount(array); return null_count == 0 || - // kUnkownNullCount comparison is needed to account + // kUnknownNullCount comparison is needed to account // for null arrays. (null_count == ::arrow::kUnknownNullCount && array.null_bitmap_data() == nullptr); @@ -312,7 +312,7 @@ struct NullableTerminalNode { // at least one other node). // // Type parameters: -// |RangeSelector| - A strategy for determine the the range of the child node to +// |RangeSelector| - A strategy for determine the range of the child node to // process. // this varies depending on the type of list (int32_t* offsets, int64_t* offsets of // fixed. @@ -830,6 +830,8 @@ class PathBuilder { // Types not yet supported in Parquet. NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(RunEndEncoded); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); #undef NOT_IMPLEMENTED_VISIT std::vector& paths() { return paths_; } diff --git a/cpp/src/parquet/arrow/path_internal.h b/cpp/src/parquet/arrow/path_internal.h index c5b7fdfdac378..50d2bf24291a1 100644 --- a/cpp/src/parquet/arrow/path_internal.h +++ b/cpp/src/parquet/arrow/path_internal.h @@ -116,7 +116,7 @@ class PARQUET_EXPORT MultipathLevelBuilder { /// /// \param[in] array The array to process. /// \param[in] array_field_nullable Whether the algorithm should consider - /// the the array column as nullable (as determined by its type's parent + /// the array column as nullable (as determined by its type's parent /// field). /// \param[in, out] context for use when allocating memory, etc. /// \param[out] write_leaf_callback Callback to receive results. @@ -129,7 +129,7 @@ class PARQUET_EXPORT MultipathLevelBuilder { /// /// \param[in] array The array to process. /// \param[in] array_field_nullable Whether the algorithm should consider - /// the the array column as nullable (as determined by its type's parent + /// the array column as nullable (as determined by its type's parent /// field). 
static ::arrow::Result> Make( const ::arrow::Array& array, bool array_field_nullable); diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc index fb9c404247f3b..0145e889ddaf7 100644 --- a/cpp/src/parquet/arrow/path_internal_test.cc +++ b/cpp/src/parquet/arrow/path_internal_test.cc @@ -381,7 +381,7 @@ TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeNulls) { /*rep_levels=*/std::vector{0, 0, 2, 2, 1, 1, 0, 2}); } -TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeNullsSomeEmptys) { +TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeNullsSomeEmpties) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); auto list_field = field("list", list(entries), /*nullable=*/true); auto nested_list_type = list(list_field); @@ -442,7 +442,7 @@ TEST_F(MultipathLevelBuilderTest, TripleNestedListsAllPresent) { }); } -TEST_F(MultipathLevelBuilderTest, TripleNestedListsWithSomeNullsSomeEmptys) { +TEST_F(MultipathLevelBuilderTest, TripleNestedListsWithSomeNullsSomeEmpties) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); auto list_field = field("list", list(entries), /*nullable=*/true); auto nested_list_type = list(list_field); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index f5484f131eb07..e1a6f44119f96 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -229,7 +229,7 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, } // The user implicitly wants timestamp data to retain its original time units, - // however the Arrow seconds time unit can not be represented (annotated) in + // however the Arrow seconds time unit cannot be represented (annotated) in // any version of Parquet and so must be coerced to milliseconds. if (type.unit() == ::arrow::TimeUnit::SECOND) { *logical_type = diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index bb75cce084097..e319f712b515d 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -48,7 +48,7 @@ Result> MakeArrowInt(const LogicalType& logical_type) return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32(); default: return Status::TypeError(logical_type.ToString(), - " can not annotate physical type Int32"); + " cannot annotate physical type Int32"); } } @@ -59,7 +59,7 @@ Result> MakeArrowInt64(const LogicalType& logical_typ return integer.is_signed() ? 
::arrow::int64() : ::arrow::uint64(); default: return Status::TypeError(logical_type.ToString(), - " can not annotate physical type Int64"); + " cannot annotate physical type Int64"); } } @@ -70,7 +70,7 @@ Result> MakeArrowTime32(const LogicalType& logical_ty return ::arrow::time32(::arrow::TimeUnit::MILLI); default: return Status::TypeError(logical_type.ToString(), - " can not annotate physical type Time32"); + " cannot annotate physical type Time32"); } } @@ -83,7 +83,7 @@ Result> MakeArrowTime64(const LogicalType& logical_ty return ::arrow::time64(::arrow::TimeUnit::NANO); default: return Status::TypeError(logical_type.ToString(), - " can not annotate physical type Time64"); + " cannot annotate physical type Time64"); } } diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 300a6d8e054cc..07c627d5eda67 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -294,7 +294,7 @@ class FileWriterImpl : public FileWriter { for (int i = 0; i < schema_->num_fields(); ++i) { // Explicitly create each ArrowWriteContext object to avoid unintentional // call of the copy constructor. Otherwise, the buffers in the type of - // sharad_ptr will be shared among all contexts. + // shared_ptr will be shared among all contexts. parallel_column_write_contexts_.emplace_back(pool, arrow_properties_.get()); } } diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index 427e73b9e6428..5201d2f6c514a 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -105,16 +105,24 @@ static ::arrow::Status ValidateBloomFilterHeader( } BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize( - const ReaderProperties& properties, ArrowInputStream* input) { - // NOTE: we don't know the bloom filter header size upfront, and we can't rely on - // InputStream::Peek() which isn't always implemented. Therefore, we must first - // Read() with an upper bound estimate of the header size, then once we know - // the bloom filter data size, we can Read() the exact number of remaining data bytes. + const ReaderProperties& properties, ArrowInputStream* input, + std::optional bloom_filter_length) { ThriftDeserializer deserializer(properties); format::BloomFilterHeader header; + int64_t bloom_filter_header_read_size = 0; + if (bloom_filter_length.has_value()) { + bloom_filter_header_read_size = bloom_filter_length.value(); + } else { + // NOTE: we don't know the bloom filter header size upfront without + // bloom_filter_length, and we can't rely on InputStream::Peek() which isn't always + // implemented. Therefore, we must first Read() with an upper bound estimate of the + // header size, then once we know the bloom filter data size, we can Read() the exact + // number of remaining data bytes. + bloom_filter_header_read_size = kBloomFilterHeaderSizeGuess; + } // Read and deserialize bloom filter header - PARQUET_ASSIGN_OR_THROW(auto header_buf, input->Read(kBloomFilterHeaderSizeGuess)); + PARQUET_ASSIGN_OR_THROW(auto header_buf, input->Read(bloom_filter_header_read_size)); // This gets used, then set by DeserializeThriftMsg uint32_t header_size = static_cast(header_buf->size()); try { @@ -136,6 +144,14 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize( bloom_filter.Init(header_buf->data() + header_size, bloom_filter_size); return bloom_filter; } + if (bloom_filter_length && *bloom_filter_length != bloom_filter_size + header_size) { + // We know the bloom filter data size, but the real size is different. 
+ std::stringstream ss; + ss << "Bloom filter length (" << bloom_filter_length.value() + << ") does not match the actual bloom filter (size: " + << bloom_filter_size + header_size << ")."; + throw ParquetException(ss.str()); + } // We have read a part of the bloom filter already, copy it to the target buffer // and read the remaining part from the InputStream. auto buffer = AllocateBuffer(properties.memory_pool(), bloom_filter_size); diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index e8ef5c0bd60db..909563d013fed 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -310,10 +310,13 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { /// a Bloom filter from a parquet filter. /// /// @param properties The parquet reader properties. - /// @param input_stream The input stream from which to construct the Bloom filter. + /// @param input_stream The input stream from which to construct the bloom filter. + /// @param bloom_filter_length The length of the serialized bloom filter including + /// header. /// @return The BlockSplitBloomFilter. - static BlockSplitBloomFilter Deserialize(const ReaderProperties& properties, - ArrowInputStream* input_stream); + static BlockSplitBloomFilter Deserialize( + const ReaderProperties& properties, ArrowInputStream* input_stream, + std::optional bloom_filter_length = std::nullopt); private: inline void InsertHashImpl(uint64_t hash); diff --git a/cpp/src/parquet/bloom_filter_reader.cc b/cpp/src/parquet/bloom_filter_reader.cc index 4e27a940c2f5e..3518d2ba1eb76 100644 --- a/cpp/src/parquet/bloom_filter_reader.cc +++ b/cpp/src/parquet/bloom_filter_reader.cc @@ -63,9 +63,20 @@ std::unique_ptr RowGroupBloomFilterReaderImpl::GetColumnBloomFilter if (file_size <= *bloom_filter_offset) { throw ParquetException("file size less or equal than bloom offset"); } + std::optional bloom_filter_length = col_chunk->bloom_filter_length(); + if (bloom_filter_length.has_value()) { + if (*bloom_filter_length < 0) { + throw ParquetException("bloom_filter_length less than 0"); + } + if (*bloom_filter_length + *bloom_filter_offset > file_size) { + throw ParquetException( + "bloom filter length + bloom filter offset greater than file size"); + } + } auto stream = ::arrow::io::RandomAccessFile::GetStream( input_, *bloom_filter_offset, file_size - *bloom_filter_offset); - auto bloom_filter = BlockSplitBloomFilter::Deserialize(properties_, stream->get()); + auto bloom_filter = + BlockSplitBloomFilter::Deserialize(properties_, stream->get(), bloom_filter_length); return std::make_unique(std::move(bloom_filter)); } diff --git a/cpp/src/parquet/bloom_filter_reader_test.cc b/cpp/src/parquet/bloom_filter_reader_test.cc index e297ab7045120..f732b4a8e22b7 100644 --- a/cpp/src/parquet/bloom_filter_reader_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
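The reader-side validation added above reduces to a few bounds checks on the optional length; restated as a standalone sketch, with std::runtime_error standing in for ParquetException:

#include <cstdint>
#include <optional>
#include <stdexcept>

void ValidateBloomFilterLength(std::optional<int32_t> bloom_filter_length,
                               int64_t bloom_filter_offset, int64_t file_size) {
  if (!bloom_filter_length.has_value()) return;  // fall back to the guessed header size
  if (*bloom_filter_length < 0) {
    throw std::runtime_error("bloom_filter_length less than 0");
  }
  if (*bloom_filter_length + bloom_filter_offset > file_size) {
    throw std::runtime_error(
        "bloom filter length + bloom filter offset greater than file size");
  }
}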
+#include #include #include "parquet/bloom_filter.h" @@ -25,31 +26,41 @@ namespace parquet::test { TEST(BloomFilterReader, ReadBloomFilter) { - std::string dir_string(parquet::test::get_data_dir()); - std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet"; - auto reader = ParquetFileReader::OpenFile(path, false); - auto file_metadata = reader->metadata(); - EXPECT_FALSE(file_metadata->is_encryption_algorithm_set()); - auto& bloom_filter_reader = reader->GetBloomFilterReader(); - auto row_group_0 = bloom_filter_reader.RowGroup(0); - ASSERT_NE(nullptr, row_group_0); - EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException); - auto bloom_filter = row_group_0->GetColumnBloomFilter(0); - ASSERT_NE(nullptr, bloom_filter); - EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException); + std::vector files = {"data_index_bloom_encoding_stats.parquet", + "data_index_bloom_encoding_with_length.parquet"}; + for (const auto& test_file : files) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/" + test_file; + auto reader = ParquetFileReader::OpenFile(path, /*memory_map=*/false); + auto file_metadata = reader->metadata(); + EXPECT_FALSE(file_metadata->is_encryption_algorithm_set()); + auto& bloom_filter_reader = reader->GetBloomFilterReader(); + auto row_group_0 = bloom_filter_reader.RowGroup(0); + ASSERT_NE(nullptr, row_group_0); + EXPECT_THROW_THAT( + [&]() { bloom_filter_reader.RowGroup(1); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("Invalid row group ordinal"))); + auto bloom_filter = row_group_0->GetColumnBloomFilter(0); + ASSERT_NE(nullptr, bloom_filter); + EXPECT_THROW_THAT([&]() { row_group_0->GetColumnBloomFilter(1); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr( + "Invalid column index at column ordinal"))); - // assert exists - { - std::string_view sv = "Hello"; - ByteArray ba{sv}; - EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); - } + // assert exists + { + std::string_view sv = "Hello"; + ByteArray ba{sv}; + EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + } - // no exists - { - std::string_view sv = "NOT_EXISTS"; - ByteArray ba{sv}; - EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + // no exists + { + std::string_view sv = "NOT_EXISTS"; + ByteArray ba{sv}; + EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + } } } diff --git a/cpp/src/parquet/bloom_filter_test.cc b/cpp/src/parquet/bloom_filter_test.cc index b7d93bce4d37b..ff83b97302274 100644 --- a/cpp/src/parquet/bloom_filter_test.cc +++ b/cpp/src/parquet/bloom_filter_test.cc @@ -107,24 +107,26 @@ TEST(BasicTest, TestBloomFilter) { // Deserialize Bloom filter from memory ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); - ::arrow::io::BufferReader source(buffer); ReaderProperties reader_properties; - BlockSplitBloomFilter de_bloom = - BlockSplitBloomFilter::Deserialize(reader_properties, &source); - - // Lookup previously inserted values - for (const auto v : kIntInserts) { - EXPECT_TRUE(de_bloom.FindHash(de_bloom.Hash(v))); - } - for (const auto v : kFloatInserts) { - EXPECT_TRUE(de_bloom.FindHash(de_bloom.Hash(v))); + for (std::optional bloom_filter_length : + std::vector>{std::nullopt, buffer->size()}) { + ::arrow::io::BufferReader source(buffer); + BlockSplitBloomFilter de_bloom = BlockSplitBloomFilter::Deserialize( + reader_properties, &source, bloom_filter_length); + // Lookup previously inserted 
values + for (const auto v : kIntInserts) { + EXPECT_TRUE(de_bloom.FindHash(de_bloom.Hash(v))); + } + for (const auto v : kFloatInserts) { + EXPECT_TRUE(de_bloom.FindHash(de_bloom.Hash(v))); + } + false_positives = 0; + for (const auto v : kNegativeIntLookups) { + false_positives += de_bloom.FindHash(de_bloom.Hash(v)); + } + EXPECT_LE(false_positives, 2); } - false_positives = 0; - for (const auto v : kNegativeIntLookups) { - false_positives += de_bloom.FindHash(de_bloom.Hash(v)); - } - EXPECT_LE(false_positives, 2); } } diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc index bed7e06786e70..e2cc24502af5d 100644 --- a/cpp/src/parquet/column_reader_test.cc +++ b/cpp/src/parquet/column_reader_test.cc @@ -269,7 +269,7 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) { } // Tests skipping around page boundaries. -TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundries) { +TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundaries) { int levels_per_page = 100; int num_pages = 7; max_def_level_ = 0; @@ -372,7 +372,7 @@ TEST_F(TestPrimitiveReader, TestSkipRepeatedField) { InitReader(&descr); Int32Reader* reader = static_cast(reader_.get()); - // Vecotrs to hold read values, definition levels, and repetition levels. + // Vectors to hold read values, definition levels, and repetition levels. std::vector read_vals(4, -1); std::vector read_defs(4, -1); std::vector read_reps(4, -1); @@ -902,7 +902,7 @@ TEST_P(RecordReaderPrimitiveTypeTest, ReadRequiredRepeated) { } // Tests reading a nullable repeated field. Tests reading null values at -// differnet levels and reading an empty list. +// different levels and reading an empty list. TEST_P(RecordReaderPrimitiveTypeTest, ReadNullableRepeated) { NodePtr column = GroupNode::Make( "p", Repetition::OPTIONAL, @@ -1240,7 +1240,7 @@ TEST_P(RecordReaderPrimitiveTypeTest, SkipRepeatedConsumeBufferFirst) { ASSERT_EQ(records_skipped, 12); CheckState(/*values_written=*/0, /*null_count=*/0, /*levels_written=*/12, /*levels_position=*/0); - // Everthing is empty because we reset the reader before this skip. + // Everything is empty because we reset the reader before this skip. CheckReadValues(/*expected_values=*/{}, /*expected_def_levels=*/{}, /*expected_rep_levels=*/{}); } @@ -1395,7 +1395,7 @@ TEST_P(RecordReaderPrimitiveTypeTest, SkipPartialRecord) { } } -INSTANTIATE_TEST_SUITE_P(RecordReaderPrimitveTypeTests, RecordReaderPrimitiveTypeTest, +INSTANTIATE_TEST_SUITE_P(RecordReaderPrimitiveTypeTests, RecordReaderPrimitiveTypeTest, ::testing::Values(/*read_dense_for_nullable=*/true, false), testing::PrintToStringParamName()); @@ -1608,8 +1608,8 @@ TEST_P(ByteArrayRecordReaderTest, ReadAndSkipOptional) { } // Tests reading and skipping an optional FLBA field. -// The binary readers only differ in DeocdeDense and DecodeSpaced functions, so -// testing optional is sufficient in excercising those code paths. +// The binary readers only differ in DecodeDense and DecodeSpaced functions, so +// testing optional is sufficient in exercising those code paths. TEST_P(FLBARecordReaderTest, ReadAndSkipOptional) { MakeRecordReader(/*levels_per_page=*/90, /*num_pages=*/1, /*FLBA_type_length=*/4); @@ -1710,7 +1710,7 @@ TEST_P(RecordReaderStressTest, StressTest) { // The levels_index and values_index are over the original vectors that have // all the rep/def values for all the records. In the following loop, we will - // read/skip a numebr of records and Reset the reader after each iteration. 
+ // read/skip a number of records and Reset the reader after each iteration. // This is on-par with how the record reader is used. size_t levels_index = 0; size_t values_index = 0; diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a7e7b2f93e174..12b2837fbfd1e 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -75,7 +75,7 @@ namespace parquet { namespace { -// Visitor that exracts the value buffer from a FlatArray at a given offset. +// Visitor that extracts the value buffer from a FlatArray at a given offset. struct ValueBufferSlicer { template ::arrow::enable_if_base_binary Visit( @@ -131,6 +131,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Union); NOT_IMPLEMENTED_VISIT(List); NOT_IMPLEMENTED_VISIT(LargeList); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); NOT_IMPLEMENTED_VISIT(Struct); NOT_IMPLEMENTED_VISIT(FixedSizeList); NOT_IMPLEMENTED_VISIT(Dictionary); @@ -1311,7 +1313,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< END_PARQUET_CATCH_EXCEPTIONS } - int64_t EstimatedBufferedValueBytes() const override { + int64_t estimated_buffered_value_bytes() const override { return current_encoder_->EstimatedDataEncodedSize(); } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 88a42acc2f706..a278670fa81c6 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -175,6 +175,9 @@ class PARQUET_EXPORT ColumnWriter { /// total_bytes_written(). virtual int64_t total_compressed_bytes_written() const = 0; + /// \brief Estimated size of the values that are not written to a page yet. + virtual int64_t estimated_buffered_value_bytes() const = 0; + /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; @@ -239,9 +242,6 @@ class TypedColumnWriter : public ColumnWriter { virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) = 0; - - // Estimated size of the values that are not written to a page yet - virtual int64_t EstimatedBufferedValueBytes() const = 0; }; using BoolWriter = TypedColumnWriter; diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 0d354f5c1ac0c..59fc848d7fd57 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -308,8 +308,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { ColumnProperties column_properties(encoding, compression, enable_dictionary, enable_statistics); column_properties.set_codec_options(codec_options); - std::shared_ptr> writer = this->BuildWriter( - num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); + std::shared_ptr> writer = + this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0, + ParquetDataPageVersion::V1, enable_checksum); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); // The behaviour should be independent from the number of Close() calls writer->Close(); @@ -557,7 +558,7 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndBrotliCompression) { #endif -#ifdef ARROW_WITH_GZIP +#ifdef ARROW_WITH_ZLIB TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCompression) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::GZIP, false, false, LARGE_SIZE); diff --git a/cpp/src/parquet/encoding.cc 
b/cpp/src/parquet/encoding.cc index 1bb487c20d3e2..9ad1ee6efc12a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -745,7 +745,7 @@ void DictEncoderImpl::Put(const ::arrow::Array& values) { template void AssertCanPutDictionary(DictEncoderImpl* encoder, const ::arrow::Array& dict) { if (dict.null_count() > 0) { - throw ParquetException("Inserted dictionary cannot cannot contain nulls"); + throw ParquetException("Inserted dictionary cannot contain nulls"); } if (encoder->num_entries() > 0) { @@ -1196,24 +1196,40 @@ struct ArrowBinaryHelper { chunk_space_remaining_(::arrow::kBinaryMemoryLimit - acc_->builder->value_data_length()) {} + // Prepare will reserve the number of entries remaining in the current chunk. + // If estimated_data_length is provided, it will also reserve the estimated data length, + // and the caller should better call `UnsafeAppend` instead of `Append` to avoid + // double-checking the data length. Status Prepare(std::optional estimated_data_length = {}) { RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); if (estimated_data_length.has_value()) { RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_data_length, ::arrow::kBinaryMemoryLimit))); + std::min(*estimated_data_length, this->chunk_space_remaining_))); } return Status::OK(); } + Status PrepareNextInput(int64_t next_value_length) { + if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { + // This element would exceed the capacity of a chunk + RETURN_NOT_OK(PushChunk()); + RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); + } + return Status::OK(); + } + + // If estimated_remaining_data_length is provided, it will also reserve the estimated + // data length, and the caller should better call `UnsafeAppend` instead of + // `Append` to avoid double-checking the data length. Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + int64_t estimated_remaining_data_length) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { // This element would exceed the capacity of a chunk RETURN_NOT_OK(PushChunk()); RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); - if (estimated_remaining_data_length.has_value()) { + if (estimated_remaining_data_length) { RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_remaining_data_length, chunk_space_remaining_))); + std::min(estimated_remaining_data_length, chunk_space_remaining_))); } } return Status::OK(); @@ -1271,8 +1287,10 @@ struct ArrowBinaryHelper { return acc_->Reserve(entries_remaining_); } + Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); } + Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + int64_t estimated_remaining_data_length) { return Status::OK(); } @@ -1915,6 +1933,9 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; ArrowBinaryHelper helper(out, num_values); + // The `len_` in the ByteArrayDictDecoder is the total length of the + // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve + // space for binary data. 
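The helper changes above separate two concerns: `Prepare()` reserves the remaining entries (and, when an estimate is available, the data bytes), while `PrepareNextInput()` starts a new output chunk whenever the next value would overflow the current one. A simplified, standalone illustration of that chunk-overflow pattern, using `arrow::BinaryBuilder` directly and a local constant in place of `::arrow::kBinaryMemoryLimit` (this is not the actual `ArrowBinaryHelper`):

```cpp
#include <arrow/api.h>

#include <cstdint>
#include <limits>
#include <memory>
#include <string_view>
#include <vector>

// Per-chunk binary data limit; mirrors the value of ::arrow::kBinaryMemoryLimit.
constexpr int64_t kChunkDataLimit = std::numeric_limits<int32_t>::max() - 1;

// Append values into bounded chunks: reserve entries up front, and finish the
// current chunk whenever the next value would not fit.
arrow::Status AppendChunked(const std::vector<std::string_view>& values,
                            std::vector<std::shared_ptr<arrow::Array>>* chunks) {
  arrow::BinaryBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Reserve(static_cast<int64_t>(values.size())));
  for (std::string_view v : values) {
    // Equivalent of PrepareNextInput(): push a chunk if this value would
    // exceed the per-chunk data capacity.
    if (builder.value_data_length() + static_cast<int64_t>(v.size()) >
        kChunkDataLimit) {
      std::shared_ptr<arrow::Array> chunk;
      ARROW_RETURN_NOT_OK(builder.Finish(&chunk));
      chunks->push_back(std::move(chunk));
    }
    ARROW_RETURN_NOT_OK(builder.Append(v));
  }
  std::shared_ptr<arrow::Array> chunk;
  ARROW_RETURN_NOT_OK(builder.Finish(&chunk));
  chunks->push_back(std::move(chunk));
  return arrow::Status::OK();
}
```

The dictionary decoding path keeps calling plain `Prepare()` because, as the new comment notes, `len_` there measures the RLE/bit-packed index stream rather than the decoded binary data, so it is not a usable reserve estimate.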
RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1983,7 +2004,10 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); - RETURN_NOT_OK(helper.Prepare(len_)); + // The `len_` in the ByteArrayDictDecoder is the total length of the + // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve + // space for binary data. + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 6cdfe37920200..de47bb7deb839 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -233,7 +233,7 @@ class DictEncoder : virtual public TypedEncoder { /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is /// assumed (without any boundschecking) that the indices reference - /// pre-existing dictionary values + /// preexisting dictionary values /// \param[in] indices the dictionary index values. Only Int32Array currently /// supported virtual void PutIndices(const ::arrow::Array& indices) = 0; diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 9861c317c80d9..ee581622c818f 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -1634,7 +1634,7 @@ TYPED_TEST(TestDeltaBitPackEncoding, NonZeroPaddedMiniblockBitWidth) { } } -// Test that the DELTA_BINARY_PACKED encoding works properply in the presence of values +// Test that the DELTA_BINARY_PACKED encoding works properly in the presence of values // that will cause integer overflow (see GH-37939). TYPED_TEST(TestDeltaBitPackEncoding, DeltaBitPackedWrapping) { using T = typename TypeParam::c_type; diff --git a/cpp/src/parquet/encryption/test_encryption_util.h b/cpp/src/parquet/encryption/test_encryption_util.h index 86aa0ff07cf84..9bfc774278dde 100644 --- a/cpp/src/parquet/encryption/test_encryption_util.h +++ b/cpp/src/parquet/encryption/test_encryption_util.h @@ -88,7 +88,7 @@ std::unordered_map BuildKeyMap(const char* const* colu const char* footer_key); // The result of this function will be used to set into EncryptionConfiguration -// as colum keys. +// as column keys. 
std::string BuildColumnKeyMapping(); // FileEncryptor and FileDecryptor are helper classes to write/read an encrypted parquet diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 4377e714a240b..6b3c7062fcc4a 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -91,7 +91,7 @@ static std::vector GetSupportedCodecTypes() { codec_types.push_back(Compression::BROTLI); #endif -#ifdef ARROW_WITH_GZIP +#ifdef ARROW_WITH_ZLIB codec_types.push_back(Compression::GZIP); #endif diff --git a/cpp/src/parquet/file_serialize_test.cc b/cpp/src/parquet/file_serialize_test.cc index 85bfd1c5147a8..62e1965418076 100644 --- a/cpp/src/parquet/file_serialize_test.cc +++ b/cpp/src/parquet/file_serialize_test.cc @@ -334,7 +334,7 @@ TYPED_TEST(TestSerialize, SmallFileBrotli) { } #endif -#ifdef ARROW_WITH_GZIP +#ifdef ARROW_WITH_ZLIB TYPED_TEST(TestSerialize, SmallFileGzip) { ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::GZIP)); } diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 2c6f628319fc4..31de95be41c47 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -100,7 +100,7 @@ struct PARQUET_EXPORT LevelInfo { } } - /// Incremetns level for a optional node. + /// Increments level for a optional node. void IncrementOptional() { def_level++; } /// Increments levels for the repeated node. Returns @@ -112,7 +112,7 @@ struct PARQUET_EXPORT LevelInfo { // to distinguish between an empty list and a list with an item in it. ++rep_level; ++def_level; - // For levels >= repeated_ancenstor_def_level it indicates the list was + // For levels >= repeated_ancestor_def_level it indicates the list was // non-null and had at least one element. This is important // for later decoding because we need to add a slot for these // values. for levels < current_def_level no slots are added diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index b12680089b839..4513573ef22f1 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -127,7 +127,7 @@ TEST(DefLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) { level_info.repeated_ancestor_def_level = 1; level_info.def_level = 2; level_info.rep_level = 1; - // All zeros should be ignored, ones should be unset in the bitmp and 2 should be set. + // All zeros should be ignored, ones should be unset in the bitmap and 2 should be set. 
std::vector def_levels = {0, 0, 0, 2, 2, 1, 0, 2}; DefLevelsToBitmap(def_levels.data(), def_levels.size(), level_info, &io); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index f43187c2dd4e5..d651ea5db0f18 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -286,6 +286,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return std::nullopt; } + inline std::optional bloom_filter_length() const { + if (column_metadata_->__isset.bloom_filter_length) { + return column_metadata_->bloom_filter_length; + } + return std::nullopt; + } + inline bool has_dictionary_page() const { return column_metadata_->__isset.dictionary_page_offset; } @@ -399,6 +406,10 @@ std::optional ColumnChunkMetaData::bloom_filter_offset() const { return impl_->bloom_filter_offset(); } +std::optional ColumnChunkMetaData::bloom_filter_length() const { + return impl_->bloom_filter_length(); +} + bool ColumnChunkMetaData::has_dictionary_page() const { return impl_->has_dictionary_page(); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 6609cff48bac2..e47c45ff0492a 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -168,6 +168,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { const std::vector& encodings() const; const std::vector& encoding_stats() const; std::optional bloom_filter_offset() const; + std::optional bloom_filter_length() const; bool has_dictionary_page() const; int64_t dictionary_page_offset() const; int64_t data_page_offset() const; diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index ec99af17f05a1..afda4c6064b36 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -428,7 +428,7 @@ class PageIndexReaderImpl : public PageIndexReader { /// Reader properties used to deserialize thrift object. const ReaderProperties& properties_; - /// File-level decrypter. + /// File-level decryptor. InternalFileDecryptor* file_decryptor_; /// Coalesced read ranges of page index of row groups that have been suggested by diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index f2ed77cb97c3b..d45c59cab223f 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -231,13 +231,13 @@ class PARQUET_EXPORT PageIndexReader { const std::vector& column_indices, const PageIndexSelection& selection) = 0; - /// \brief Advise the reader page index of these row groups will not be read any more. + /// \brief Advise the reader page index of these row groups will not be read anymore. /// /// The PageIndexReader implementation has the opportunity to cancel any prefetch or /// release resource that are related to these row groups. /// /// \param[in] row_group_indices list of row group ordinal that whose page index will - /// not be accessed any more. + /// not be accessed anymore. virtual void WillNotNeed(const std::vector& row_group_indices) = 0; /// \brief Determine the column index and offset index ranges for the given row group. @@ -263,7 +263,7 @@ class PARQUET_EXPORT ColumnIndexBuilder { /// \brief Add statistics of a data page. /// /// If the ColumnIndexBuilder has seen any corrupted statistics, it will - /// not update statistics any more. + /// not update statistics anymore. /// /// \param stats Page statistics in the encoded form. 
virtual void AddPage(const EncodedStatistics& stats) = 0; diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index d802166be66e8..a1883d335aa23 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -20,7 +20,6 @@ /** * File format description for the parquet file format */ - cpp_include "parquet/windows_compatibility.h" namespace cpp parquet.format namespace java org.apache.parquet.format @@ -193,6 +192,52 @@ enum FieldRepetitionType { REPEATED = 2; } +/** + * A structure for capturing metadata for estimating the unencoded, + * uncompressed size of data written. This is useful for readers to estimate + * how much memory is needed to reconstruct data in their memory model and for + * fine grained filter pushdown on nested structures (the histograms contained + * in this structure can help determine the number of nulls at a particular + * nesting level and maximum length of lists). + */ +struct SizeStatistics { + /** + * The number of physical bytes stored for BYTE_ARRAY data values assuming + * no encoding. This is exclusive of the bytes needed to store the length of + * each byte array. In other words, this field is equivalent to the `(size + * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + * written)`. To determine unencoded sizes of other types readers can use + * schema information multiplied by the number of non-null and null values. + * The number of null/non-null values can be inferred from the histograms + * below. + * + * For example, if a column chunk is dictionary-encoded with dictionary + * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + * then this value for that data page should be 7 (1 + 1 + 2 + 3). + * + * This field should only be set for types that use BYTE_ARRAY as their + * physical type. + */ + 1: optional i64 unencoded_byte_array_data_bytes; + /** + * When present, there is expected to be one element corresponding to each + * repetition (i.e. size=max repetition_level+1) where each element + * represents the number of times the repetition level was observed in the + * data. + * + * This field may be omitted if max_repetition_level is 0 without loss + * of information. + **/ + 2: optional list repetition_level_histogram; + /** + * Same as repetition_level_histogram except for definition levels. + * + * This field may be omitted if max_definition_level is 0 or 1 without + * loss of information. + **/ + 3: optional list definition_level_histogram; +} + /** * Statistics per row group and per page * All fields are optional. @@ -218,13 +263,23 @@ struct Statistics { /** count of distinct values occurring */ 4: optional i64 distinct_count; /** - * Min and max values for the column, determined by its ColumnOrder. + * Lower and upper bound values for the column, determined by its ColumnOrder. + * + * These may be the actual minimum and maximum values found on a page or column + * chunk, but can also be (more compact) values that do not exist on a page or + * column chunk. For example, instead of storing "Blart Versenwald III", a writer + * may set min_value="B", max_value="C". Such more compact values must still be + * valid values within the column's logical type. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. 
*/ 5: optional binary max_value; 6: optional binary min_value; + /** If true, max_value is the actual maximum value for a column */ + 7: optional bool is_max_value_exact; + /** If true, min_value is the actual minimum value for a column */ + 8: optional bool is_min_value_exact; } /** Empty structs to use as logical type annotations */ @@ -234,7 +289,7 @@ struct MapType {} // see LogicalTypes.md struct ListType {} // see LogicalTypes.md struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 struct DateType {} // allowed for INT32 -struct Float16Type{} // allowed for FIXED[2], must encode raw FLOAT16 bytes +struct Float16Type {} // allowed for FIXED[2], must encoded raw FLOAT16 bytes /** * Logical type to annotate a column that is always null. @@ -248,6 +303,9 @@ struct NullType {} // allowed for any physical type, only null values stored /** * Decimal logical type annotation * + * Scale must be zero or a positive integer less than or equal to the precision. + * Precision must be a non-zero positive integer. + * * To maintain forward-compatibility in v1, implementations using this logical * type must also set scale and precision on the annotated SchemaElement. * @@ -530,7 +588,7 @@ struct DataPageHeader { /** Encoding used for repetition levels **/ 4: required Encoding repetition_level_encoding; - /** Optional statistics for the data in this page**/ + /** Optional statistics for the data in this page **/ 5: optional Statistics statistics; } @@ -572,19 +630,19 @@ struct DataPageHeaderV2 { // repetition levels and definition levels are always using RLE (without size in it) - /** length of the definition levels */ + /** Length of the definition levels */ 5: required i32 definition_levels_byte_length; - /** length of the repetition levels */ + /** Length of the repetition levels */ 6: required i32 repetition_levels_byte_length; - /** whether the values are compressed. + /** Whether the values are compressed. Which means the section of the page between definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) is compressed with the compression_codec. If missing it is considered compressed */ - 7: optional bool is_compressed = 1; + 7: optional bool is_compressed = true; - /** optional statistics for the data in this page **/ + /** Optional statistics for the data in this page **/ 8: optional Statistics statistics; } @@ -597,11 +655,11 @@ union BloomFilterAlgorithm { } /** Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash - * algorithm. It uses 64 bits version of xxHash. + * algorithm. It uses 64 bits version of xxHash. **/ struct XxHash {} -/** +/** * The hash function used in Bloom filter. This function takes the hash of a column value * using plain encoding. **/ @@ -757,6 +815,22 @@ struct ColumnMetaData { /** Byte offset from beginning of file to Bloom filter data. **/ 14: optional i64 bloom_filter_offset; + + /** Size of Bloom filter data including the serialized header, in bytes. + * Added in 2.10 so readers may not read this field from old files and + * it can be obtained after the BloomFilterHeader has been deserialized. + * Writers should write this field so readers can read the bloom filter + * in a single I/O. + */ + 15: optional i32 bloom_filter_length; + + /** + * Optional statistics to help estimate total memory when converted to in-memory + * representations. 
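The `SizeStatistics` fields introduced above are cheap to derive while writing a page. A simplified sketch of that computation, with a plain struct standing in for the generated Thrift type and following the definitions in the comments (BYTE_ARRAY bytes without the 4-byte length prefixes; one histogram bucket per level):

```cpp
#include <cstdint>
#include <string_view>
#include <vector>

// Hypothetical stand-in for the generated SizeStatistics Thrift struct.
struct PageSizeStatistics {
  int64_t unencoded_byte_array_data_bytes = 0;
  std::vector<int64_t> repetition_level_histogram;
  std::vector<int64_t> definition_level_histogram;
};

PageSizeStatistics ComputeSizeStatistics(const std::vector<std::string_view>& values,
                                         const std::vector<int16_t>& rep_levels,
                                         const std::vector<int16_t>& def_levels,
                                         int16_t max_rep_level,
                                         int16_t max_def_level) {
  PageSizeStatistics stats;
  // BYTE_ARRAY only: PLAIN-encoded size of the values minus the length prefixes.
  for (std::string_view v : values) {
    stats.unencoded_byte_array_data_bytes += static_cast<int64_t>(v.size());
  }
  // One bucket per observable level, i.e. size = max level + 1.
  stats.repetition_level_histogram.assign(max_rep_level + 1, 0);
  stats.definition_level_histogram.assign(max_def_level + 1, 0);
  for (int16_t r : rep_levels) ++stats.repetition_level_histogram[r];
  for (int16_t d : def_levels) ++stats.definition_level_histogram[d];
  return stats;
}
```

For the dictionary example in the comment ("a", "a", "bc", "cde" once the indices are expanded), `unencoded_byte_array_data_bytes` comes out to 7; as noted above, the repetition histogram may be omitted when the max repetition level is 0, and the definition histogram when it is 0 or 1.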
The histograms contained in these statistics can + * also be useful in some cases for more fine-grained nullability/list length + * filter pushdown. + */ + 16: optional SizeStatistics size_statistics; } struct EncryptionWithFooterKey { @@ -765,7 +839,7 @@ struct EncryptionWithFooterKey { struct EncryptionWithColumnKey { /** Column path in schema **/ 1: required list path_in_schema - + /** Retrieval metadata of column encryption key **/ 2: optional binary key_metadata } @@ -804,7 +878,7 @@ struct ColumnChunk { /** Crypto metadata of encrypted columns **/ 8: optional ColumnCryptoMetaData crypto_metadata - + /** Encrypted column metadata for this chunk **/ 9: optional binary encrypted_column_metadata } @@ -897,7 +971,7 @@ union ColumnOrder { * - If the min is +0, the row group may contain -0 values as well. * - If the max is -0, the row group may contain +0 values as well. * - When looking for NaN values, min and max should be ignored. - * + * * When writing statistics the following rules should be followed: * - NaNs should not be written to min or max statistics fields. * - If the computed max value is zero (whether negative or positive), @@ -931,6 +1005,13 @@ struct OffsetIndex { * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. */ 1: required list page_locations + /** + * Unencoded/uncompressed size for BYTE_ARRAY types. + * + * See documention for unencoded_byte_array_data_bytes in SizeStatistics for + * more details on this field. + */ + 2: optional list unencoded_byte_array_data_bytes } /** @@ -970,6 +1051,25 @@ struct ColumnIndex { /** A list containing the number of null values for each page **/ 5: optional list null_counts + + /** + * Contains repetition level histograms for each page + * concatenated together. The repetition_level_histogram field on + * SizeStatistics contains more details. + * + * When present the length should always be (number of pages * + * (max_repetition_level + 1)) elements. + * + * Element 0 is the first element of the histogram for the first page. + * Element (max_repetition_level + 1) is the first element of the histogram + * for the second page. + **/ + 6: optional list repetition_level_histograms; + /** + * Same as repetition_level_histograms except for definitions levels. 
+ **/ + 7: optional list definition_level_histograms; + } struct AesGcmV1 { @@ -978,7 +1078,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -990,7 +1090,7 @@ struct AesGcmCtrV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 2c81abb9eee79..f11397ab96ed8 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -314,6 +314,34 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); + if (column_chunk->bloom_filter_offset()) { + // Output BloomFilter {offset, length} + stream << "\", BloomFilter {" + << "\"offset\": \"" << column_chunk->bloom_filter_offset().value(); + if (column_chunk->bloom_filter_length()) { + stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value(); + } + stream << "\"}"; + } + + if (column_chunk->GetColumnIndexLocation()) { + auto location = column_chunk->GetColumnIndexLocation().value(); + // Output ColumnIndex {offset, length} + stream << "\", ColumnIndex {" + << "\"offset\": \"" << location.offset; + stream << "\", \"length\": \"" << location.length; + stream << "\"}"; + } + + if (column_chunk->GetOffsetIndexLocation()) { + auto location = column_chunk->GetOffsetIndexLocation().value(); + // Output OffsetIndex {offset, length} + stream << "\", OffsetIndex {" + << "\"offset\": \"" << location.offset; + stream << "\", \"length\": \"" << location.length; + stream << "\"}"; + } + // end of a ColumnChunk stream << "\" }"; c1++; diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 8fe12d3de0b6c..5223158e5f4f9 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -88,7 +88,7 @@ std::string lz4_raw_compressed_larger() { return data_file("lz4_raw_compressed_larger.parquet"); } -std::string overflow_i16_page_oridinal() { +std::string overflow_i16_page_ordinal() { return data_file("overflow_i16_page_cnt.parquet"); } @@ -116,6 +116,10 @@ std::string rle_dict_uncompressed_corrupt_checksum() { return data_file("rle-dict-uncompressed-corrupt-checksum.parquet"); } +std::string concatenated_gzip_members() { + return data_file("concatenated_gzip_members.parquet"); +} + // TODO: Assert on definition and repetition levels template void AssertColumnValues(std::shared_ptr> col, int64_t batch_size, @@ -425,7 +429,7 @@ TEST_F(TestAllTypesPlain, TestBatchRead) { ASSERT_FALSE(col->HasNext()); } -TEST_F(TestAllTypesPlain, RowGroupColumnBoundchecking) { +TEST_F(TestAllTypesPlain, RowGroupColumnBoundsChecking) { // Part of PARQUET-1857 ASSERT_THROW(reader_->RowGroup(reader_->metadata()->num_row_groups()), ParquetException); @@ -778,6 +782,28 @@ TEST_F(TestCheckDataPageCrc, CorruptDict) { } } +TEST(TestGzipMembersRead, TwoConcatenatedMembers) { +#ifndef ARROW_WITH_ZLIB + GTEST_SKIP() << "Test requires Zlib compression"; +#endif + auto file_reader = ParquetFileReader::OpenFile(concatenated_gzip_members(), + /*memory_map=*/false); + auto col_reader = std::dynamic_pointer_cast>( + 
file_reader->RowGroup(0)->Column(0)); + int64_t num_values = 0; + int64_t num_repdef = 0; + std::vector reps(1024); + std::vector defs(1024); + std::vector vals(1024); + + num_repdef = + col_reader->ReadBatch(1024, defs.data(), reps.data(), vals.data(), &num_values); + EXPECT_EQ(num_repdef, 513); + for (int64_t i = 0; i < num_repdef; i++) { + EXPECT_EQ(i + 1, vals[i]); + } +} + TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) { // PARQUET-816. Some files generated by older Parquet implementations may // contain malformed data page metadata, and we can successfully decode them @@ -1285,7 +1311,7 @@ INSTANTIATE_TEST_SUITE_P(Lz4CodecTests, TestCodec, ::testing::ValuesIn(test_code // INT16_MAX pages. (GH-15074). TEST(TestFileReader, TestOverflowInt16PageOrdinal) { ReaderProperties reader_props; - auto file_reader = ParquetFileReader::OpenFile(overflow_i16_page_oridinal(), + auto file_reader = ParquetFileReader::OpenFile(overflow_i16_page_ordinal(), /*memory_map=*/false, reader_props); auto metadata_ptr = file_reader->metadata(); EXPECT_EQ(1, metadata_ptr->num_row_groups()); diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 5437fa2208a53..4ddeef9e83975 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -255,14 +255,14 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_); } else { error << logical_type_->ToString(); - error << " can not be applied to primitive type "; + error << " cannot be applied to primitive type "; error << TypeToString(physical_type); throw ParquetException(error.str()); } } else { error << "Nested logical type "; error << logical_type_->ToString(); - error << " can not be applied to non-group node"; + error << " cannot be applied to non-group node"; throw ParquetException(error.str()); } } else { @@ -344,7 +344,7 @@ GroupNode::GroupNode(const std::string& name, Repetition::type repetition, std::stringstream error; error << "Logical type "; error << logical_type_->ToString(); - error << " can not be applied to group node"; + error << " cannot be applied to group node"; throw ParquetException(error.str()); } } else { diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index a1b5557497d9c..2532a8656e69f 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -908,7 +908,7 @@ static void ConfirmFactoryEquivalence( TEST(TestLogicalTypeConstruction, FactoryEquivalence) { // For each legacy converted type, ensure that the equivalent logical type object // can be obtained from either the base class's FromConvertedType() factory method or - // the logical type type class's Make() method (accessed via convenience methods on the + // the logical type class's Make() method (accessed via convenience methods on the // base class) and that these logical type objects are equivalent struct ConfirmFactoryEquivalenceArguments { @@ -1870,7 +1870,7 @@ class TestSchemaElementConstruction : public ::testing::Test { if (expect_logicalType_) { ASSERT_TRUE(element_->__isset.logicalType) << node_->logical_type()->ToString() - << " logical type unexpectedly failed to genverate a logicalType in the Thrift " + << " logical type unexpectedly failed to generate a logicalType in the Thrift " "intermediate object"; ASSERT_TRUE(check_logicalType_()) << node_->logical_type()->ToString() diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 37b245e0dd6c2..e54b94f1a861a 100644 --- 
a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -438,9 +438,9 @@ class TypedComparatorImpl return Helper::Compare(type_length_, a, b); } - bool Compare(const T& a, const T& b) override { return CompareInline(a, b); } + bool Compare(const T& a, const T& b) const override { return CompareInline(a, b); } - std::pair GetMinMax(const T* values, int64_t length) override { + std::pair GetMinMax(const T* values, int64_t length) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); @@ -457,7 +457,7 @@ class TypedComparatorImpl std::pair GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset) override { + int64_t valid_bits_offset) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); @@ -477,7 +477,7 @@ class TypedComparatorImpl return {min, max}; } - std::pair GetMinMax(const ::arrow::Array& values) override { + std::pair GetMinMax(const ::arrow::Array& values) const override { ParquetException::NYI(values.type()->ToString()); } @@ -491,7 +491,7 @@ class TypedComparatorImpl template <> std::pair TypedComparatorImpl::GetMinMax(const int32_t* values, - int64_t length) { + int64_t length) const { DCHECK_GT(length, 0); const uint32_t* unsigned_values = reinterpret_cast(values); @@ -537,13 +537,13 @@ std::pair GetMinMaxBinaryHelper( template <> std::pair TypedComparatorImpl::GetMinMax( - const ::arrow::Array& values) { + const ::arrow::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } template <> std::pair TypedComparatorImpl::GetMinMax( - const ::arrow::Array& values) { + const ::arrow::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index ae6c1ca29b2f6..0d6ea9898f7ba 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -73,16 +73,16 @@ class TypedComparator : public Comparator { /// \brief Scalar comparison of two elements, return true if first /// is strictly less than the second - virtual bool Compare(const T& a, const T& b) = 0; + virtual bool Compare(const T& a, const T& b) const = 0; /// \brief Compute maximum and minimum elements in a batch of /// elements without any nulls - virtual std::pair GetMinMax(const T* values, int64_t length) = 0; + virtual std::pair GetMinMax(const T* values, int64_t length) const = 0; /// \brief Compute minimum and maximum elements from an Arrow array. 
Only /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY /// / arrow::BinaryArray - virtual std::pair GetMinMax(const ::arrow::Array& values) = 0; + virtual std::pair GetMinMax(const ::arrow::Array& values) const = 0; /// \brief Compute maximum and minimum elements in a batch of /// elements with accompanying bitmap indicating which elements are @@ -96,7 +96,7 @@ class TypedComparator : public Comparator { /// the first element in the sequence virtual std::pair GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset) = 0; + int64_t valid_bits_offset) const = 0; }; /// \brief Typed version of Comparator::Make @@ -205,7 +205,7 @@ class PARQUET_EXPORT Statistics { ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); /// \brief Create a new statistics instance given a column schema - /// definition and pre-existing state + /// definition and preexisting state /// \param[in] descr the column schema /// \param[in] encoded_min the encoded minimum value /// \param[in] encoded_max the encoded maximum value diff --git a/cpp/src/parquet/stream_writer.cc b/cpp/src/parquet/stream_writer.cc index 856436d701816..c578b7e527467 100644 --- a/cpp/src/parquet/stream_writer.cc +++ b/cpp/src/parquet/stream_writer.cc @@ -157,7 +157,7 @@ StreamWriter& StreamWriter::WriteVariableLength(const char* data_ptr, writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); } if (max_row_group_size_ > 0) { - row_group_size_ += writer->EstimatedBufferedValueBytes(); + row_group_size_ += writer->estimated_buffered_value_bytes(); } return *this; } @@ -178,7 +178,7 @@ StreamWriter& StreamWriter::WriteFixedLength(const char* data_ptr, std::size_t d writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr); } if (max_row_group_size_ > 0) { - row_group_size_ += writer->EstimatedBufferedValueBytes(); + row_group_size_ += writer->estimated_buffered_value_bytes(); } return *this; } diff --git a/cpp/src/parquet/stream_writer.h b/cpp/src/parquet/stream_writer.h index f95d39fd1d504..7637cf7da245c 100644 --- a/cpp/src/parquet/stream_writer.h +++ b/cpp/src/parquet/stream_writer.h @@ -185,7 +185,7 @@ class PARQUET_EXPORT StreamWriter { writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v); if (max_row_group_size_ > 0) { - row_group_size_ += writer->EstimatedBufferedValueBytes(); + row_group_size_ += writer->estimated_buffered_value_bytes(); } return *this; } diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 89b685a64c311..d69d979223e88 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 89b685a64c3117b3023d8684af1f41400841db71 +Subproject commit d69d979223e883faef9dc6fe3cf573087243c28a diff --git a/csharp/examples/FlightClientExample/Program.cs b/csharp/examples/FlightClientExample/Program.cs index f0cf6e1e862ee..8a3c170da68dc 100644 --- a/csharp/examples/FlightClientExample/Program.cs +++ b/csharp/examples/FlightClientExample/Program.cs @@ -81,7 +81,7 @@ public static async Task Main(string[] args) Console.WriteLine($"Read batch from flight server: \n {batch}") ; } - // See available comands on this server + // See available commands on this server var action_stream = client.ListActions(); Console.WriteLine("Actions:"); while (await action_stream.ResponseStream.MoveNext()) diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 
67b37e49c7dc5..1849bf11b7439 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamReader.cs b/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamReader.cs index 588127537dfeb..d21fb25f5c946 100644 --- a/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamReader.cs +++ b/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamReader.cs @@ -38,11 +38,11 @@ public abstract class FlightRecordBatchStreamReader : IAsyncStreamReader flightDataStream) { - _arrowReaderImplementation = new RecordBatcReaderImplementation(flightDataStream); + _arrowReaderImplementation = new RecordBatchReaderImplementation(flightDataStream); } public ValueTask Schema => _arrowReaderImplementation.ReadSchema(); @@ -53,7 +53,7 @@ internal ValueTask GetFlightDescriptor() } /// - /// Get the application metadata from the latest recieved record batch + /// Get the application metadata from the latest received record batch /// public IReadOnlyList ApplicationMetadata => _arrowReaderImplementation.ApplicationMetadata; diff --git a/csharp/src/Apache.Arrow.Flight/Internal/RecordBatcReaderImplementation.cs b/csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs similarity index 96% rename from csharp/src/Apache.Arrow.Flight/Internal/RecordBatcReaderImplementation.cs rename to csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs index 10d4d731eb9f7..be844ea58e404 100644 --- a/csharp/src/Apache.Arrow.Flight/Internal/RecordBatcReaderImplementation.cs +++ b/csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs @@ -25,13 +25,13 @@ namespace Apache.Arrow.Flight.Internal { - internal class RecordBatcReaderImplementation : ArrowReaderImplementation + internal class RecordBatchReaderImplementation : ArrowReaderImplementation { private readonly IAsyncStreamReader _flightDataStream; private FlightDescriptor _flightDescriptor; private readonly List _applicationMetadatas; - public RecordBatcReaderImplementation(IAsyncStreamReader streamReader) + public RecordBatchReaderImplementation(IAsyncStreamReader streamReader) { _flightDataStream = streamReader; _applicationMetadatas = new List(); diff --git a/csharp/src/Apache.Arrow.Flight/Internal/StreamReader.cs b/csharp/src/Apache.Arrow.Flight/Internal/StreamReader.cs index a2c3db3d340ce..b07509d9ac324 100644 --- a/csharp/src/Apache.Arrow.Flight/Internal/StreamReader.cs +++ b/csharp/src/Apache.Arrow.Flight/Internal/StreamReader.cs @@ -27,7 +27,7 @@ namespace Apache.Arrow.Flight.Internal /// This is a helper class that allows conversions from gRPC types to the Arrow types. /// It maintains the stream so data can be read as soon as possible. 
/// - /// In paramter from gRPC + /// In parameter from gRPC /// The arrow type returned internal class StreamReader : IAsyncStreamReader { diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 1eec449077479..62d5858fadeb2 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -16,7 +16,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index a7ddb14af6a10..1bd4035d5b9da 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -18,10 +18,11 @@ using System.Collections.Generic; using System.Runtime.CompilerServices; using Apache.Arrow.Memory; +using System.Collections; namespace Apache.Arrow { - public class BinaryArray : Array + public class BinaryArray : Array, IReadOnlyList { public class Builder : BuilderBase { @@ -366,5 +367,18 @@ public ReadOnlySpan GetBytes(int index, out bool isNull) return ValueBuffer.Span.Slice(ValueOffsets[index], GetValueLength(index)); } + + int IReadOnlyCollection.Count => Length; + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs index 0915338fe6a91..e9c5f8979e48f 100644 --- a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs @@ -16,11 +16,12 @@ using Apache.Arrow.Memory; using Apache.Arrow.Types; using System; +using System.Collections; using System.Collections.Generic; namespace Apache.Arrow { - public class BooleanArray: Array + public class BooleanArray: Array, IReadOnlyList { public class Builder : IArrowArrayBuilder { @@ -190,5 +191,19 @@ public bool GetBoolean(int index) ? (bool?)null : BitUtility.GetBit(ValueBuffer.Span, index + Offset); } + + int IReadOnlyCollection.Count => Length; + + bool? IReadOnlyList.this[int index] => GetValue(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetValue(index); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs index 23ad7356eb322..6ab4986f573e2 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; namespace Apache.Arrow { @@ -22,7 +23,10 @@ namespace Apache.Arrow /// The class holds an array of dates in the Date32 format, where each date is /// stored as the number of days since the dawn of (UNIX) time. /// - public class Date32Array : PrimitiveArray + public class Date32Array : PrimitiveArray, IReadOnlyList +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { private static readonly DateTime _epochDate = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Unspecified); #if NET6_0_OR_GREATER @@ -133,6 +137,30 @@ public Date32Array(ArrayData data) ? DateOnly.FromDayNumber(_epochDayNumber + value.Value) : default(DateOnly?); } + + int IReadOnlyCollection.Count => Length; + + DateOnly? 
IReadOnlyList.this[int index] => GetDateOnly(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateOnly(index); + }; + } #endif + + int IReadOnlyCollection.Count => Length; + + DateTime? IReadOnlyList.this[int index] => GetDateTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateTime(index); + }; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs index b0d42e27bbd23..43e698e10b25c 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; namespace Apache.Arrow { @@ -23,7 +24,10 @@ namespace Apache.Arrow /// stored as the number of milliseconds since the dawn of (UNIX) time, excluding leap seconds, in multiples of /// 86400000. /// - public class Date64Array: PrimitiveArray + public class Date64Array : PrimitiveArray, IReadOnlyList +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { private const long MillisecondsPerDay = 86400000; @@ -39,7 +43,7 @@ public Date64Array( /// public class Builder : DateArrayBuilder { - private class DateBuilder: PrimitiveArrayBuilder + private class DateBuilder : PrimitiveArrayBuilder { protected override Date64Array Build( ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, @@ -135,6 +139,30 @@ public Date64Array(ArrayData data) ? DateOnly.FromDateTime(DateTimeOffset.FromUnixTimeMilliseconds(value.Value).UtcDateTime) : default(DateOnly?); } + + int IReadOnlyCollection.Count => Length; + + DateOnly? IReadOnlyList.this[int index] => GetDateOnly(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateOnly(index); + }; + } #endif + + int IReadOnlyCollection.Count => Length; + + DateTime? IReadOnlyList.this[int index] => GetDateTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetDateTime(index); + }; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs index 01724e2acda3e..0e3ec56740449 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal128Array.cs @@ -151,6 +151,30 @@ public Decimal128Array(ArrayData data) return DecimalUtility.GetDecimal(ValueBuffer, index, Scale, ByteWidth); } + public IList ToList(bool includeNulls = false) + { + var list = new List(Length); + + for (int i = 0; i < Length; i++) + { + decimal? value = GetValue(i); + + if (value.HasValue) + { + list.Add(value.Value); + } + else + { + if (includeNulls) + { + list.Add(null); + } + } + } + + return list; + } + public string GetString(int index) { if (IsNull(index)) diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs index f314c2d6ebc9e..94a47f258280e 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs @@ -157,6 +157,30 @@ public Decimal256Array(ArrayData data) return DecimalUtility.GetDecimal(ValueBuffer, index, Scale, ByteWidth); } + public IList ToList(bool includeNulls = false) + { + var list = new List(Length); + + for (int i = 0; i < Length; i++) + { + decimal? 
value = GetValue(i); + + if (value.HasValue) + { + list.Add(value.Value); + } + else + { + if (includeNulls) + { + list.Add(null); + } + } + } + + return list; + } + public string GetString(int index) { if (IsNull(index)) diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs index 7365a77b6329e..0456c5cc65ba4 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs @@ -14,12 +14,13 @@ // limitations under the License. using System; +using System.Collections; using System.Collections.Generic; using System.Runtime.CompilerServices; namespace Apache.Arrow { - public abstract class PrimitiveArray : Array + public abstract class PrimitiveArray : Array, IReadOnlyList where T : struct { protected PrimitiveArray(ArrayData data) @@ -66,5 +67,24 @@ protected PrimitiveArray(ArrayData data) return list; } + + int IReadOnlyCollection.Count => Length; + T? IReadOnlyList.this[int index] => GetValue(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return IsValid(index) ? Values[index] : null; + } + } + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return IsValid(index) ? Values[index] : null; + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index 42104b27175a9..af77fe1b1a83d 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -15,13 +15,14 @@ using Apache.Arrow.Types; using System; +using System.Collections; using System.Collections.Generic; using System.Runtime.InteropServices; using System.Text; namespace Apache.Arrow { - public class StringArray: BinaryArray + public class StringArray: BinaryArray, IReadOnlyList { public static readonly Encoding DefaultEncoding = Encoding.UTF8; @@ -91,5 +92,19 @@ public string GetString(int index, Encoding encoding = default) return encoding.GetString(data, bytes.Length); } } + + int IReadOnlyCollection.Count => Length; + + string IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + }; + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/StructArray.cs b/csharp/src/Apache.Arrow/Arrays/StructArray.cs index 11d40e6d4e886..5b827c7b85e85 100644 --- a/csharp/src/Apache.Arrow/Arrays/StructArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StructArray.cs @@ -72,11 +72,11 @@ private IReadOnlyList InitializeFields() IRecordType IArrowRecord.Schema => (StructType)Data.DataType; - int IArrowRecord.ColumnCount => _fields.Count; + int IArrowRecord.ColumnCount => Fields.Count; IArrowArray IArrowRecord.Column(string columnName, IEqualityComparer comparer) => - _fields[((StructType)Data.DataType).GetFieldIndex(columnName, comparer)]; + Fields[((StructType)Data.DataType).GetFieldIndex(columnName, comparer)]; - IArrowArray IArrowRecord.Column(int columnIndex) => _fields[columnIndex]; + IArrowArray IArrowRecord.Column(int columnIndex) => Fields[columnIndex]; } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs index 824694cd6d04b..e9c2d7a4d9b28 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs @@ 
-15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; using System.IO; namespace Apache.Arrow @@ -24,6 +25,9 @@ namespace Apache.Arrow /// stored as the number of seconds/ milliseconds (depending on the Time32Type) since midnight. /// public class Time32Array : PrimitiveArray +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { /// /// The class can be used to fluently build objects. @@ -155,6 +159,18 @@ public Time32Array(ArrayData data) _ => throw new InvalidDataException($"Unsupported time unit for Time32Type: {unit}") }; } + + int IReadOnlyCollection.Count => Length; + + TimeOnly? IReadOnlyList.this[int index] => GetTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTime(index); + }; + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs index 3369893304414..fc18dfb8bf726 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs @@ -15,6 +15,7 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; using System.IO; namespace Apache.Arrow @@ -24,6 +25,9 @@ namespace Apache.Arrow /// stored as the number of microseconds/nanoseconds (depending on the Time64Type) since midnight. /// public class Time64Array : PrimitiveArray +#if NET6_0_OR_GREATER + , IReadOnlyList +#endif { /// /// The class can be used to fluently build objects. @@ -146,6 +150,18 @@ public Time64Array(ArrayData data) return new TimeOnly(((Time64Type)Data.DataType).Unit.ConvertToTicks(value.Value)); } + + int IReadOnlyCollection.Count => Length; + + TimeOnly? IReadOnlyList.this[int index] => GetTime(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTime(index); + }; + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs index 0dc5726d01734..ccb656854a5df 100644 --- a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs @@ -15,12 +15,13 @@ using Apache.Arrow.Types; using System; +using System.Collections.Generic; using System.Diagnostics; using System.IO; namespace Apache.Arrow { - public class TimestampArray: PrimitiveArray + public class TimestampArray : PrimitiveArray, IReadOnlyList { private static readonly DateTimeOffset s_epoch = new DateTimeOffset(1970, 1, 1, 0, 0, 0, 0, TimeSpan.Zero); @@ -145,5 +146,16 @@ public DateTimeOffset GetTimestampUnchecked(int index) return GetTimestampUnchecked(index); } + int IReadOnlyCollection.Count => Length; + + DateTimeOffset? 
IReadOnlyList.this[int index] => GetTimestamp(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetTimestamp(index); + }; + } } } diff --git a/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs b/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs index 410c22885a984..6bdd131763f28 100644 --- a/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs +++ b/csharp/src/Apache.Arrow/ArrowBuffer.BitmapBuilder.cs @@ -98,7 +98,7 @@ public BitmapBuilder Append(bool value) public BitmapBuilder Append(ReadOnlySpan source, int validBits) { if (!source.IsEmpty && validBits > source.Length * 8) - throw new ArgumentException($"Number of valid bits ({validBits}) cannot be greater than the the source span length ({source.Length * 8} bits).", nameof(validBits)); + throw new ArgumentException($"Number of valid bits ({validBits}) cannot be greater than the source span length ({source.Length * 8} bits).", nameof(validBits)); // Check if memory copy can be used from the source array (performance optimization for byte-aligned coping) if (!source.IsEmpty && Length % 8 == 0) diff --git a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/ByteBuffer.cs b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/ByteBuffer.cs index e65735c347a6d..c3b3a1766a532 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/ByteBuffer.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/ByteBuffer.cs @@ -208,7 +208,7 @@ public static int SizeOf() /// Checks if the Type provided is supported as scalar value /// /// The Type to check - /// True if the type is a scalar type that is supported, falsed otherwise + /// True if the type is a scalar type that is supported, false otherwise public static bool IsSupportedType() { return genericSizes.ContainsKey(typeof(T)); diff --git a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferBuilder.cs b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferBuilder.cs index aa17d23867f99..422c9403b2dd9 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferBuilder.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferBuilder.cs @@ -67,7 +67,7 @@ public FlatBufferBuilder(int initialSize) } /// - /// Create a FlatBufferBuilder backed by the pased in ByteBuffer + /// Create a FlatBufferBuilder backed by the passed in ByteBuffer /// /// The ByteBuffer to write to public FlatBufferBuilder(ByteBuffer buffer) @@ -474,7 +474,7 @@ public VectorOffset CreateVectorOfTables(Offset[] offsets) where T : struc return EndVector(); } - /// @cond FLATBUFFERS_INTENRAL + /// @cond FLATBUFFERS_INTERNAL public void Nested(int obj) { // Structs are always stored inline, so need to be created right diff --git a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferVerify.cs b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferVerify.cs index b108aa3e28ec3..418d38e32bbb9 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferVerify.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/FlatBufferVerify.cs @@ -117,7 +117,7 @@ public Verifier() /// The Constructor of the Verifier object with input parameters: ByteBuffer and/or Options /// Input flat byte buffer defined as ByteBuffer type - /// Options object with settings for the coniguration the Verifier + /// Options object with settings for the configuration the Verifier public Verifier(ByteBuffer buf, Options options = null) { verifier_buffer = buf; @@ -261,7 +261,7 @@ private short GetVRelOffset(int pos, short vtableOffset) } /// Get table data area absolute offset from 
vtable. Result is the absolute buffer offset. - /// The result value offset cannot be '0' (pointing to itself) so after validation this method returnes '0' + /// The result value offset cannot be '0' (pointing to itself) so after validation this method returns '0' /// value as a marker for missing optional entry /// Table Position value in the Byte Buffer /// offset value in the Table @@ -273,7 +273,7 @@ private uint GetVOffset(uint tablePos, short vtableOffset) short relPos = GetVRelOffset(Convert.ToInt32(tablePos), vtableOffset); if (relPos != 0) { - // Calculate offset based on table postion + // Calculate offset based on table position UOffset = Convert.ToUInt32(tablePos + relPos); } else @@ -482,7 +482,7 @@ public bool VerifyTableEnd(uint tablePos) return true; } - /// Verifiy static/inlined data area field + /// Verify static/inlined data area field /// Position in the Table /// Offset to the static/inlined data element /// Size of the element @@ -633,9 +633,9 @@ public bool VerifyNestedBuffer(uint tablePos, short offsetId, VerifyTableAction var vecStart = vecOffset + SIZE_U_OFFSET; // Create and Copy nested buffer bytes from part of Verify Buffer var nestedByteBuffer = new ByteBuffer(verifier_buffer.ToArray(Convert.ToInt32(vecStart), Convert.ToInt32(vecLength))); - var nestedVerifyier = new Verifier(nestedByteBuffer, options); + var nestedVerifier = new Verifier(nestedByteBuffer, options); // There is no internal identifier - use empty one - if (!nestedVerifyier.CheckBufferFromStart("", 0, verifyAction)) + if (!nestedVerifier.CheckBufferFromStart("", 0, verifyAction)) { return false; } @@ -643,7 +643,7 @@ public bool VerifyNestedBuffer(uint tablePos, short offsetId, VerifyTableAction return true; } - /// Verifiy static/inlined data area at absolute offset + /// Verify static/inlined data area at absolute offset /// Position of static/inlined data area in the Byte Buffer /// Size of the union data /// Alignment bool value @@ -705,7 +705,7 @@ public bool VerifyUnion(uint tablePos, short typeIdVOffset, short valueVOffset, /// Verify vector of unions (objects). Unions are verified using generated verifyObjFunc /// Position of the Table /// Offset in the Table (Union type id) - /// Offset to vector of Data Stucture offset + /// Offset to vector of Data Structure offset /// Verification Method used for Union /// Required Value when the offset == 0 /// Return True when the verification of the Vector of Unions passed diff --git a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/Table.cs b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/Table.cs index e2452e40eb8e9..860ba550713a6 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/Table.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/FlatBuffers/Table.cs @@ -91,7 +91,7 @@ public int __vector(int offset) } #if ENABLE_SPAN_T && (UNSAFE_BYTEBUFFER || NETSTANDARD2_1) - // Get the data of a vector whoses offset is stored at "offset" in this object as an + // Get the data of a vector whose offset is stored at "offset" in this object as an // Spant<byte>. If the vector is not present in the ByteBuffer, // then an empty span will be returned. 
public Span __vector_as_span(int offset, int elementSize) where T : struct @@ -113,7 +113,7 @@ public Span __vector_as_span(int offset, int elementSize) where T : struct return MemoryMarshal.Cast(bb.ToSpan(pos, len * elementSize)); } #else - // Get the data of a vector whoses offset is stored at "offset" in this object as an + // Get the data of a vector whose offset is stored at "offset" in this object as an // ArraySegment<byte>. If the vector is not present in the ByteBuffer, // then a null value will be returned. public ArraySegment? __vector_as_arraysegment(int offset) @@ -130,7 +130,7 @@ public Span __vector_as_span(int offset, int elementSize) where T : struct } #endif - // Get the data of a vector whoses offset is stored at "offset" in this object as an + // Get the data of a vector whose offset is stored at "offset" in this object as an // T[]. If the vector is not present in the ByteBuffer, then a null value will be // returned. public T[] __vector_as_array(int offset) diff --git a/csharp/src/Apache.Arrow/Flatbuf/Types/Timestamp.cs b/csharp/src/Apache.Arrow/Flatbuf/Types/Timestamp.cs index 93620ea577899..e7b2ac3041ea0 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Types/Timestamp.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Types/Timestamp.cs @@ -108,7 +108,7 @@ namespace Apache.Arrow.Flatbuf /// no indication of how to map this information to a physical point in time. /// Naive date-times must be handled with care because of this missing /// information, and also because daylight saving time (DST) may make -/// some values ambiguous or non-existent. A naive date-time may be +/// some values ambiguous or nonexistent. A naive date-time may be /// stored as a struct with Date and Time fields. However, it may also be /// encoded into a Timestamp column with an empty timezone. 
The timestamp /// values should be computed "as if" the timezone of the date-time values diff --git a/csharp/src/Apache.Arrow/Properties/AssembyInfo.cs b/csharp/src/Apache.Arrow/Properties/AssemblyInfo.cs similarity index 100% rename from csharp/src/Apache.Arrow/Properties/AssembyInfo.cs rename to csharp/src/Apache.Arrow/Properties/AssemblyInfo.cs diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 475d7ccc3ef28..c222dc0bca08b 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,8 +8,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 656ee6a2470e4..0de93b470a201 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 53fdd6d62dbcb..c227abbed4c5d 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 66becb84c5b66..5b36e369b1961 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,8 +15,8 @@ - - + + all runtime; build; native; contentfiles; analyzers diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index 96918ff091639..269c2390a70fa 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -14,6 +14,8 @@ // limitations under the License. using System; +using System.Collections; +using System.Collections.Generic; using System.Numerics; using Xunit; @@ -93,6 +95,33 @@ void TestIsValid(ArrowBuffer valueBuf, ArrowBuffer nullBitmapBuf, int length, in } } + [Fact] + public void EnumerateArray() + { + var array = new Int64Array.Builder().Append(1).Append(2).Build(); + + foreach(long? 
foo in (IEnumerable)array) + { + Assert.InRange(foo.Value, 1, 2); + } + + foreach (object foo in (IEnumerable)array) + { + Assert.InRange((long)foo, 1, 2); + } + } + + [Fact] + public void ArrayAsReadOnlyList() + { + Int64Array array = new Int64Array.Builder().Append(1).Append(2).Build(); + var readOnlyList = (IReadOnlyList)array; + + Assert.Equal(array.Length, readOnlyList.Count); + Assert.Equal(readOnlyList[0], 1); + Assert.Equal(readOnlyList[1], 2); + } + #if NET5_0_OR_GREATER [Fact] public void SliceArray() diff --git a/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs index dec8274962442..a4e698f543bdc 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowBufferBitmapBuilderTests.cs @@ -128,7 +128,7 @@ public void BitsAreAppendedToEmptyBuilder(byte[] bytesToAppend, [InlineData(new byte[] { 254 }, 4, 12, 11, 1)] [InlineData(new byte[] { 254, 1 }, 9, 17, 16, 1)] [InlineData(new byte[] { 249, 1 }, 9, 17, 15, 2)] - public void BitsAreAppendedToBuilderContainingByteAllignedData(byte[] bytesToAppend, + public void BitsAreAppendedToBuilderContainingByteAlignedData(byte[] bytesToAppend, int validBits, int expectedLength, int expectedSetBitCount, @@ -154,7 +154,7 @@ public void BitsAreAppendedToBuilderContainingByteAllignedData(byte[] bytesToApp [InlineData(new byte[] { 254 }, 4, 13, 12, 1)] [InlineData(new byte[] { 254, 1 }, 9, 18, 17, 1)] [InlineData(new byte[] { 249, 1 }, 9, 18, 16, 2)] - public void BitsAreAppendedToBuilderContainingNotAllignedData(byte[] bytesToAppend, + public void BitsAreAppendedToBuilderContainingNotAlignedData(byte[] bytesToAppend, int validBits, int expectedLength, int expectedSetBitCount, @@ -369,7 +369,7 @@ public void CapacityIncreased(int initialCapacity, int numBitsToAppend, int addi } [Fact] - public void NegtativeCapacityThrows() + public void NegativeCapacityThrows() { // Arrange var builder = new ArrowBuffer.BitmapBuilder(); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs index a310a3609af07..69b8410d030f2 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs @@ -57,7 +57,7 @@ public void Ctor_LeaveOpenTrue_StreamValidOnDispose() /// /// [Fact] - public async Task WritesFooterAlignedMulitpleOf8() + public async Task WritesFooterAlignedMultipleOf8() { RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100); @@ -85,7 +85,7 @@ public async Task WritesFooterAlignedMulitpleOf8() /// /// [Fact] - public async Task WritesFooterAlignedMulitpleOf8Async() + public async Task WritesFooterAlignedMultipleOf8Async() { RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100); diff --git a/csharp/test/Apache.Arrow.Tests/RecordTests.cs b/csharp/test/Apache.Arrow.Tests/RecordTests.cs index 09b0d2c6655ba..cfca4556b63a6 100644 --- a/csharp/test/Apache.Arrow.Tests/RecordTests.cs +++ b/csharp/test/Apache.Arrow.Tests/RecordTests.cs @@ -74,7 +74,25 @@ public void VisitStructAndBatch() StructArray level2Array = new StructArray(level2, stringArray.Length, new[] { level1Array }, nulls); RecordBatch batch = new RecordBatch(schema, new IArrowArray[] { level2Array }, stringArray.Length); + var visitor3 = new TestArrayVisitor1(); + visitor3.Visit(batch); + Assert.Equal("111utf8", visitor3.stringBuilder.ToString()); + var visitor4 = new TestArrayVisitor2(); + 
visitor4.Visit(batch); + Assert.Equal("322utf8", visitor4.stringBuilder.ToString()); + } + + [Fact] + public void LazyStructInitialization() + { + StringArray stringArray = new StringArray.Builder().Append("one").AppendNull().AppendNull().Append("four").Build(); + Field stringField = new Field("column1", StringType.Default, true); + StructType structType = new StructType(new[] { stringField }); + ArrayData structData = new ArrayData(structType, stringArray.Length, 0, 0, new[] { ArrowBuffer.Empty }, new[] { stringArray.Data }); + IArrowRecord structArray = new StructArray(structData); + Assert.Equal(1, structArray.ColumnCount); + Assert.Equal(structArray.Length, structArray.Column(0).Length); } private class TestTypeVisitor1 : IArrowTypeVisitor, IArrowTypeVisitor diff --git a/dev/archery/README.md b/dev/archery/README.md index 60417db047505..9991e7402d832 100644 --- a/dev/archery/README.md +++ b/dev/archery/README.md @@ -27,7 +27,7 @@ Archery is documented on the Arrow website: # Installing Archery -See the pages linked aboved for more details. As a general overview, Archery +See the pages linked above for more details. As a general overview, Archery comes in a number of subpackages, each needing to be installed if you want to use the functionality of it: diff --git a/dev/archery/archery/bot.py b/dev/archery/archery/bot.py index e8fbcfcb0f3a3..68b24dc08d71b 100644 --- a/dev/archery/archery/bot.py +++ b/dev/archery/archery/bot.py @@ -333,7 +333,7 @@ def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): dest : Path Filesystem path to clone the repositories to. crossbow_repo : str - Github repository name, like kszucs/crossbow. + GitHub repository name, like kszucs/crossbow. pull_request : pygithub.PullRequest Object containing information about the pull request the comment bot was triggered from. diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 7a3b45f9788e6..32b094263098c 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -42,7 +42,7 @@ BOOL = ArrowBool() -@click.group() +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.option("--debug", type=BOOL, is_flag=True, default=False, help="Increase logging with debugging output.") @click.option("--pdb", type=BOOL, is_flag=True, default=False, @@ -167,7 +167,7 @@ def _apply_options(cmd, options): @click.option("--with-hdfs", default=None, type=BOOL, help="Build the Arrow HDFS bridge.") @click.option("--with-hiveserver2", default=None, type=BOOL, - help="Build the HiveServer2 client and arrow adapater.") + help="Build the HiveServer2 client and arrow adapter.") @click.option("--with-ipc", default=None, type=BOOL, help="Build the Arrow IPC extensions.") @click.option("--with-json", default=None, type=BOOL, @@ -177,7 +177,7 @@ def _apply_options(cmd, options): @click.option("--with-parquet", default=None, type=BOOL, help="Build with Parquet file support.") @click.option("--with-python", default=None, type=BOOL, - help="Build the Arrow CPython extesions.") + help="Build the Arrow CPython extensions.") @click.option("--with-r", default=None, type=BOOL, help="Build the Arrow R extensions. 
This is not a CMake option, " "it will toggle required options") diff --git a/dev/archery/archery/crossbow/cli.py b/dev/archery/archery/crossbow/cli.py index a44115c2bc6ea..d30d012859815 100644 --- a/dev/archery/archery/crossbow/cli.py +++ b/dev/archery/archery/crossbow/cli.py @@ -174,7 +174,7 @@ def submit(obj, tasks, groups, params, job_prefix, config_path, arrow_version, 'locally. Examples: https://github.com/apache/arrow or ' 'https://github.com/raulcd/arrow.') @click.option('--rc', default=None, - help='Relase Candidate number.') + help='Release Candidate number.') @click.option('--version', default=None, help='Release version.') @click.option('--verify-binaries', is_flag=True, default=False, diff --git a/dev/archery/archery/crossbow/core.py b/dev/archery/archery/crossbow/core.py index 3353c30b1a67c..57e91e206748d 100644 --- a/dev/archery/archery/crossbow/core.py +++ b/dev/archery/archery/crossbow/core.py @@ -502,7 +502,7 @@ def github_upload_asset_requests(self, release, path, name, mime, logger.error('Attempt {} has failed with message: {}.' .format(i + 1, str(e))) logger.error('Error message {}'.format(e.msg)) - logger.error('List of errors provided by Github:') + logger.error('List of errors provided by GitHub:') for err in e.errors: logger.error(' - {}'.format(err)) @@ -526,7 +526,7 @@ def github_upload_asset_requests(self, release, path, name, mime, time.sleep(retry_backoff) - raise RuntimeError('Github asset uploading has failed!') + raise RuntimeError('GitHub asset uploading has failed!') def github_upload_asset_curl(self, release, path, name, mime): upload_url, _ = release.upload_url.split('{?') @@ -1029,8 +1029,8 @@ class TaskAssets(dict): def __init__(self, github_release, artifact_patterns, validate_patterns=True): - # HACK(kszucs): don't expect uploaded assets of no atifacts were - # defiened for the tasks in order to spare a bit of github rate limit + # HACK(kszucs): don't expect uploaded assets of no artifacts were + # defined for the tasks in order to spare a bit of github rate limit if not artifact_patterns: return @@ -1142,7 +1142,7 @@ def show(self, stream=None): @classmethod def from_config(cls, config, target, tasks=None, groups=None, params=None): """ - Intantiate a job from based on a config. + Instantiate a job from based on a config. 
Parameters ---------- @@ -1302,7 +1302,7 @@ def select(self, tasks=None, groups=None): } def validate(self): - # validate that the task groups are properly refering to the tasks + # validate that the task groups are properly referring to the tasks for group_name, group in self['groups'].items(): for pattern in group: # remove the negation character for blocklisted tasks diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py index ea10e75ad3478..d8efa42341ce6 100644 --- a/dev/archery/archery/crossbow/reports.py +++ b/dev/archery/archery/crossbow/reports.py @@ -282,7 +282,7 @@ class CommentReport(Report): badges = { 'github': _markdown_badge.format( - title='Github Actions', + title='GitHub Actions', badge=( 'https://github.com/{repo}/actions/workflows/crossbow.yml/' 'badge.svg?branch={branch}' diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml index eb03bbee0bd6d..68915a3f3df70 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml +++ b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml @@ -1,7 +1,7 @@ !Job target: !Target head: f766a1d615dd1b7ee706d05102e579195951a61c - email: unkown + email: unknown branch: refs/pull/4435/merge remote: https://github.com/apache/arrow version: 0.13.0.dev306 diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml index f6de07dd456c8..e8224ef01a37f 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml +++ b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml @@ -1,7 +1,7 @@ !Job target: !Target head: f766a1d615dd1b7ee706d05102e579195951a61c - email: unkown + email: unknown branch: refs/pull/4435/merge remote: https://github.com/apache/arrow version: 0.13.0.dev306 diff --git a/dev/archery/archery/integration/cdata.py b/dev/archery/archery/integration/cdata.py index 8e5550fcdb9c5..a5dbbe29d8aba 100644 --- a/dev/archery/archery/integration/cdata.py +++ b/dev/archery/archery/integration/cdata.py @@ -18,10 +18,19 @@ import cffi from contextlib import contextmanager import functools +import os +import sys from .tester import CDataExporter, CDataImporter +if sys.platform == "darwin": + dll_suffix = ".dylib" +elif os.name == "nt": + dll_suffix = ".dll" +else: + dll_suffix = ".so" + _c_data_decls = """ struct ArrowSchema { // Array type description diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index ff10c0bb03fb6..80cc1c1e76425 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1858,9 +1858,7 @@ def _temp_path(): .skip_tester('Go') .skip_tester('Java') .skip_tester('JS') - .skip_tester('Rust') - .skip_format(SKIP_C_SCHEMA, 'C++') - .skip_format(SKIP_C_ARRAY, 'C++'), + .skip_tester('Rust'), generate_extension_case() .skip_tester('C#') diff --git a/dev/archery/archery/integration/tester_cpp.py b/dev/archery/archery/integration/tester_cpp.py index 658e71330155e..2a47bc830886a 100644 --- a/dev/archery/archery/integration/tester_cpp.py +++ b/dev/archery/archery/integration/tester_cpp.py @@ -18,7 +18,6 @@ import contextlib import functools import os -import sys import subprocess from . 
import cdata @@ -42,15 +41,8 @@ "localhost", ] -if sys.platform == "darwin": - _dll_suffix = ".dylib" -elif os.name == "nt": - _dll_suffix = ".dll" -else: - _dll_suffix = ".so" - _DLL_PATH = _EXE_PATH -_ARROW_DLL = os.path.join(_DLL_PATH, "libarrow" + _dll_suffix) +_ARROW_DLL = os.path.join(_DLL_PATH, "libarrow" + cdata.dll_suffix) class CppTester(Tester): @@ -175,6 +167,7 @@ def make_c_data_importer(self): @functools.lru_cache def _load_ffi(ffi, lib_path=_ARROW_DLL): + os.environ['ARROW_DEBUG_MEMORY_POOL'] = 'trap' ffi.cdef(_cpp_c_data_entrypoints) dll = ffi.dlopen(lib_path) dll.ArrowCpp_CDataIntegration_ExportSchemaFromJson diff --git a/dev/archery/archery/integration/tester_csharp.py b/dev/archery/archery/integration/tester_csharp.py index 7dca525673ba6..4f7765641130d 100644 --- a/dev/archery/archery/integration/tester_csharp.py +++ b/dev/archery/archery/integration/tester_csharp.py @@ -38,6 +38,7 @@ def _load_clr(): global _clr_loaded if not _clr_loaded: _clr_loaded = True + os.environ['DOTNET_GCHeapHardLimit'] = '0xC800000' # 200 MiB import pythonnet pythonnet.load("coreclr") import clr diff --git a/dev/archery/archery/integration/tester_go.py b/dev/archery/archery/integration/tester_go.py index 2b3dc3a1be336..b59cd9d113291 100644 --- a/dev/archery/archery/integration/tester_go.py +++ b/dev/archery/archery/integration/tester_go.py @@ -18,7 +18,6 @@ import contextlib import functools import os -import sys import subprocess from . import cdata @@ -43,17 +42,10 @@ "localhost", ] -if sys.platform == "darwin": - _dll_suffix = ".dylib" -elif os.name == "nt": - _dll_suffix = ".dll" -else: - _dll_suffix = ".so" - _DLL_PATH = os.path.join( ARROW_ROOT_DEFAULT, "go/arrow/internal/cdata_integration") -_INTEGRATION_DLL = os.path.join(_DLL_PATH, "arrow_go_integration" + _dll_suffix) +_INTEGRATION_DLL = os.path.join(_DLL_PATH, "arrow_go_integration" + cdata.dll_suffix) class GoTester(Tester): @@ -167,6 +159,9 @@ def make_c_data_importer(self): @functools.lru_cache def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + # NOTE that setting Go environment variables here (such as GODEBUG) + # would be ignored by the Go runtime. The environment variables need + # to be set from the process calling Archery. ffi.cdef(_go_c_data_entrypoints) dll = ffi.dlopen(lib_path) return dll diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 5684798d794ad..6cd1afa64feb8 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -34,11 +34,15 @@ def load_version_from_pom(): return version_tag.text -# XXX Should we add "-Darrow.memory.debug.allocator=true"? It adds a couple -# minutes to total CPU usage of the integration test suite. +# NOTE: we don't add "-Darrow.memory.debug.allocator=true" here as it adds a +# couple minutes to total CPU usage of the integration test suite +# (see setup_jpype() below). 
_JAVA_OPTS = [ "-Dio.netty.tryReflectionSetAccessible=true", "-Darrow.struct.conflict.policy=CONFLICT_APPEND", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + # GH-39113: avoid failures accessing files in `/tmp/hsperfdata_...` + "-XX:-UsePerfData", ] _arrow_version = load_version_from_pom() @@ -80,7 +84,12 @@ def setup_jpype(): jar_path = f"{_ARROW_TOOLS_JAR}:{_ARROW_C_DATA_JAR}" # XXX Didn't manage to tone down the logging level here (DEBUG -> INFO) jpype.startJVM(jpype.getDefaultJVMPath(), - "-Djava.class.path=" + jar_path, *_JAVA_OPTS) + "-Djava.class.path=" + jar_path, + # This flag is too heavy for IPC and Flight tests + "-Darrow.memory.debug.allocator=true", + # Reduce internal use of signals by the JVM + "-Xrs", + *_JAVA_OPTS) class _CDataBase: diff --git a/dev/archery/archery/integration/tester_rust.py b/dev/archery/archery/integration/tester_rust.py index c7a94de2197bd..56b07859dc82a 100644 --- a/dev/archery/archery/integration/tester_rust.py +++ b/dev/archery/archery/integration/tester_rust.py @@ -16,15 +16,19 @@ # under the License. import contextlib +import functools import os import subprocess -from .tester import Tester +from . import cdata +from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log from ..utils.source import ARROW_ROOT_DEFAULT -_EXE_PATH = os.path.join(ARROW_ROOT_DEFAULT, "rust/target/debug") +_EXE_PATH = os.environ.get( + "ARROW_RUST_EXE_PATH", os.path.join(ARROW_ROOT_DEFAULT, "rust/target/debug") +) _INTEGRATION_EXE = os.path.join(_EXE_PATH, "arrow-json-integration-test") _STREAM_TO_FILE = os.path.join(_EXE_PATH, "arrow-stream-to-file") _FILE_TO_STREAM = os.path.join(_EXE_PATH, "arrow-file-to-stream") @@ -37,12 +41,19 @@ "localhost", ] +_INTEGRATION_DLL = os.path.join(_EXE_PATH, + "libarrow_integration_testing" + cdata.dll_suffix) + class RustTester(Tester): PRODUCER = True CONSUMER = True FLIGHT_SERVER = True FLIGHT_CLIENT = True + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True name = 'Rust' @@ -117,3 +128,102 @@ def flight_request(self, port, json_path=None, scenario_name=None): if self.debug: log(' '.join(cmd)) run_cmd(cmd) + + def make_c_data_exporter(self): + return RustCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return RustCDataImporter(self.debug, self.args) + + +_rust_c_data_entrypoints = """ + const char* arrow_rs_cdata_integration_export_schema_from_json( + const char* json_path, uintptr_t out); + const char* arrow_rs_cdata_integration_import_schema_and_compare_to_json( + const char* json_path, uintptr_t c_schema); + + const char* arrow_rs_cdata_integration_export_batch_from_json( + const char* json_path, int num_batch, uintptr_t out); + const char* arrow_rs_cdata_integration_import_batch_and_compare_to_json( + const char* json_path, int num_batch, uintptr_t c_array); + + void arrow_rs_free_error(const char*); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_rust_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + + def __init__(self, debug, args): + self.debug = debug + self.args = args + self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _pointer_to_int(self, c_ptr): + return self.ffi.cast('uintptr_t', c_ptr) + + def _check_rust_error(self, rs_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. 
+ The string is dynamically allocated on the Rust side. + """ + assert self.ffi.typeof(rs_error) is self.ffi.typeof("const char*") + if rs_error != self.ffi.NULL: + try: + error = self.ffi.string(rs_error).decode( + 'utf8', errors='replace') + raise RuntimeError( + f"Rust C Data Integration call failed: {error}") + finally: + self.dll.arrow_rs_free_error(rs_error) + + +class RustCDataExporter(CDataExporter, _CDataBase): + + def export_schema_from_json(self, json_path, c_schema_ptr): + rs_error = self.dll.arrow_rs_cdata_integration_export_schema_from_json( + str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + self._check_rust_error(rs_error) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + rs_error = self.dll.arrow_rs_cdata_integration_export_batch_from_json( + str(json_path).encode(), num_batch, + self._pointer_to_int(c_array_ptr)) + self._check_rust_error(rs_error) + + @property + def supports_releasing_memory(self): + return True + + def record_allocation_state(self): + # FIXME we should track the amount of Rust-allocated memory (GH-38822) + return 0 + + +class RustCDataImporter(CDataImporter, _CDataBase): + + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + rs_error = \ + self.dll.arrow_rs_cdata_integration_import_schema_and_compare_to_json( + str(json_path).encode(), self._pointer_to_int(c_schema_ptr)) + self._check_rust_error(rs_error) + + def import_batch_and_compare_to_json(self, json_path, num_batch, + c_array_ptr): + rs_error = \ + self.dll.arrow_rs_cdata_integration_import_batch_and_compare_to_json( + str(json_path).encode(), num_batch, self._pointer_to_int(c_array_ptr)) + self._check_rust_error(rs_error) + + @property + def supports_releasing_memory(self): + return True diff --git a/dev/archery/archery/release/cli.py b/dev/archery/archery/release/cli.py index ed15dcb1ed6dc..92fdbb801f357 100644 --- a/dev/archery/archery/release/cli.py +++ b/dev/archery/archery/release/cli.py @@ -32,7 +32,7 @@ help='OAuth token for GitHub authentication') @click.pass_obj def release(obj, src, github_token): - """Release releated commands.""" + """Release related commands.""" obj['issue_tracker'] = IssueTracker(github_token=github_token) obj['repo'] = src.path @@ -98,7 +98,7 @@ def release_changelog_generate(obj, version, output): @release_changelog.command('regenerate') @click.pass_obj def release_changelog_regenerate(obj): - """Regeneretate the whole CHANGELOG.md file""" + """Regenerate the whole CHANGELOG.md file""" issue_tracker, repo = obj['issue_tracker'], obj['repo'] changelogs = [] issue_tracker = IssueTracker(issue_tracker=issue_tracker) diff --git a/dev/archery/archery/release/core.py b/dev/archery/archery/release/core.py index e08f52baa216d..d6eab45e1804c 100644 --- a/dev/archery/archery/release/core.py +++ b/dev/archery/archery/release/core.py @@ -468,7 +468,7 @@ def curate(self, minimal=False): parquet.append((self.jira_instance.issue(c.issue), c)) else: warnings.warn( - f'Issue {c.issue} is not MINOR nor pertains to GH' + + f'Issue {c.issue} does not pertain to GH' + ', ARROW or PARQUET') outside.append((c.issue, c)) diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json b/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json index d591105f0798b..2ff052966d1e3 100644 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json +++ b/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json @@ -57,7 +57,7 @@ }, "repository_url": 
"https://api.github.com/repos/ursa-labs/ursabot", "state": "open", - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "updated_at": "2019-04-05T11:55:43Z", "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", "user": { diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json b/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json index 5a8f3461c0ca9..7b29d7316b2cf 100644 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json +++ b/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json @@ -57,7 +57,7 @@ }, "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", "state": "open", - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "updated_at": "2019-04-05T11:35:47Z", "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", "user": { diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json b/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json index bfb7210df8a3a..419a068c5dbc8 100644 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json +++ b/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json @@ -57,7 +57,7 @@ }, "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", "state": "open", - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "updated_at": "2019-04-05T11:35:47Z", "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", "user": { diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json b/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json index a3d450078aeb0..39b4895cd043e 100644 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json +++ b/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json @@ -57,7 +57,7 @@ }, "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", "state": "open", - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "updated_at": "2019-04-05T11:26:56Z", "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", "user": { diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json b/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json index c88197c8e0244..7022a4315f4d0 100644 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json +++ b/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json @@ -62,7 +62,7 @@ }, "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", "state": "open", - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "updated_at": "2019-04-05T11:35:46Z", "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", "user": { diff --git a/dev/archery/archery/tests/fixtures/event-pr-review-committer.json b/dev/archery/archery/tests/fixtures/event-pr-review-committer.json index 0022ced8a4f32..9c02e373cb530 100644 --- a/dev/archery/archery/tests/fixtures/event-pr-review-committer.json +++ b/dev/archery/archery/tests/fixtures/event-pr-review-committer.json @@ -12,7 +12,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git 
a/dev/archery/archery/tests/fixtures/event-pr-review-non-committer.json b/dev/archery/archery/tests/fixtures/event-pr-review-non-committer.json index 8f305ff0a2ddd..5ef1b20c786fa 100644 --- a/dev/archery/archery/tests/fixtures/event-pr-review-non-committer.json +++ b/dev/archery/archery/tests/fixtures/event-pr-review-non-committer.json @@ -12,7 +12,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/event-pull-request-opened.json b/dev/archery/archery/tests/fixtures/event-pull-request-opened.json index 9cf5c0dda7843..19f37de7bc10d 100644 --- a/dev/archery/archery/tests/fixtures/event-pull-request-opened.json +++ b/dev/archery/archery/tests/fixtures/event-pull-request-opened.json @@ -12,7 +12,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-committer.json b/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-committer.json index 67a4836c7635f..0a780c3ab0557 100644 --- a/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-committer.json +++ b/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-committer.json @@ -26,7 +26,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-non-committer.json b/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-non-committer.json index 6f4db37ccb465..97d0fe09264b5 100644 --- a/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-non-committer.json +++ b/dev/archery/archery/tests/fixtures/event-pull-request-target-opened-non-committer.json @@ -26,7 +26,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/event-pull-request-target-synchronize.json b/dev/archery/archery/tests/fixtures/event-pull-request-target-synchronize.json index e7e7055e964c2..1be6bfe0acb4f 100644 --- a/dev/archery/archery/tests/fixtures/event-pull-request-target-synchronize.json +++ b/dev/archery/archery/tests/fixtures/event-pull-request-target-synchronize.json @@ -28,7 +28,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/issue-26.json b/dev/archery/archery/tests/fixtures/issue-26.json index 44c4d3bedef48..7f7bf53797e2a 100644 --- a/dev/archery/archery/tests/fixtures/issue-26.json +++ b/dev/archery/archery/tests/fixtures/issue-26.json @@ -8,7 +8,7 @@ "id": 429706959, "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", "number": 26, - "title": "Unittests for GithubHook + native asyncio syntax", + "title": "Unittests for GitHubHook + native asyncio syntax", "user": { "login": "kszucs", "id": 961747, @@ -46,7 +46,7 @@ "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch" }, - "body": "Resolves:\r\n- #26 Unittests for GithubHook + native 
asyncio syntax\r\n- #27 Use native async/await keywords instead of @inlineCallbacks and yield\r\n", + "body": "Resolves:\r\n- #26 Unittests for GitHubHook + native asyncio syntax\r\n- #27 Use native async/await keywords instead of @inlineCallbacks and yield\r\n", "closed_by": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/pull-request-26-awaiting-review.json b/dev/archery/archery/tests/fixtures/pull-request-26-awaiting-review.json index d295afb396e3c..b5805ec8c41da 100644 --- a/dev/archery/archery/tests/fixtures/pull-request-26-awaiting-review.json +++ b/dev/archery/archery/tests/fixtures/pull-request-26-awaiting-review.json @@ -9,7 +9,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/fixtures/pull-request-26-commit.json b/dev/archery/archery/tests/fixtures/pull-request-26-commit.json index ffc48943a6ca8..b5f92ccf53856 100644 --- a/dev/archery/archery/tests/fixtures/pull-request-26-commit.json +++ b/dev/archery/archery/tests/fixtures/pull-request-26-commit.json @@ -92,7 +92,7 @@ "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-build-command.json", "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-build-command.json", "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-build-command.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"I've successfully started builds for this PR\",\n+ \"created_at\": \"2019-04-05T11:55:44Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248730\",\n+ \"id\": 480248730,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODczMA==\",\n+ \"updated_at\": \"2019-04-05T11:55:44Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248730\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 4,\n+ \"comments_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:55:44Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ 
\"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" + "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"I've successfully started builds for this PR\",\n+ \"created_at\": \"2019-04-05T11:55:44Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248730\",\n+ \"id\": 480248730,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODczMA==\",\n+ \"updated_at\": \"2019-04-05T11:55:44Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248730\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": 
\"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 4,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GitHubHook\",\n+ \"updated_at\": \"2019-04-05T11:55:44Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": 
\"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" }, { "sha": "7ef554e333327f0e62aa1fd76b4b17844a39adeb", @@ 
-104,7 +104,7 @@ "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-by-ursabot.json", "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-by-ursabot.json", "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-by-ursabot.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"Unknown command \\\"\\\"\",\n+ \"created_at\": \"2019-04-05T11:35:47Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815\",\n+ \"id\": 480243815,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 2,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": 
\"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" + "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"Unknown command \\\"\\\"\",\n+ \"created_at\": \"2019-04-05T11:35:47Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815\",\n+ \"id\": 480243815,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 2,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GitHubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ 
\"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" }, { "sha": "a8082dbc91fdfe815b795e49ec10e49000771ef5", @@ -116,7 +116,7 @@ "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"bear is no game\",\n+ \"created_at\": \"2019-04-05T11:26:56Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480241727\",\n+ \"id\": 480241727,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MTcyNw==\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480241727\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ 
\"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 0,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": 
\"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ 
\"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": 
\"https://api.github.com/users/kszucs\"\n+ }\n+}" + "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"bear is no game\",\n+ \"created_at\": \"2019-04-05T11:26:56Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480241727\",\n+ \"id\": 480241727,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MTcyNw==\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480241727\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 0,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GitHubHook\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ 
\"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ 
\"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": 
\"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" }, { "sha": "2770e29ba9086394455315e590c0b433d08e437e", @@ -128,7 +128,7 @@ "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-with-empty-command.json", "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-with-empty-command.json", "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-with-empty-command.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"@ursabot \",\n+ \"created_at\": \"2019-04-05T11:35:46Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243811\",\n+ \"id\": 480243811,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxMQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243811\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 1,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 
429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ 
\"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ 
\"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" + "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"@ursabot \",\n+ \"created_at\": \"2019-04-05T11:35:46Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243811\",\n+ \"id\": 480243811,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxMQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243811\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": 
\"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 1,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GitHubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": 
\"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" }, { "sha": "80ff46510a2f39ae60f7c3a98e5fdaef8e688784", @@ -152,7 +152,7 @@ "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/test_hooks.py", "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/test_hooks.py", "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/test_hooks.py?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -54,7 
+54,7 @@ class TestGithubHook(ChangeHookTestCase):\n await self.request('ping', {})\n assert len(self.hook.master.data.updates.changesAdded) == 0\n \n- @ensure_deferred\n- async def test_issue_comment(self):\n- payload = {}\n- await self.request('issue_comment', payload)\n+ # @ensure_deferred\n+ # async def test_issue_comment(self):\n+ # payload = {}\n+ # await self.request('issue_comment', payload)" + "patch": "@@ -54,7 +54,7 @@ class TestGitHubHook(ChangeHookTestCase):\n await self.request('ping', {})\n assert len(self.hook.master.data.updates.changesAdded) == 0\n \n- @ensure_deferred\n- async def test_issue_comment(self):\n- payload = {}\n- await self.request('issue_comment', payload)\n+ # @ensure_deferred\n+ # async def test_issue_comment(self):\n+ # payload = {}\n+ # await self.request('issue_comment', payload)" } ] } \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/pull-request-26.json b/dev/archery/archery/tests/fixtures/pull-request-26.json index d295afb396e3c..b5805ec8c41da 100644 --- a/dev/archery/archery/tests/fixtures/pull-request-26.json +++ b/dev/archery/archery/tests/fixtures/pull-request-26.json @@ -9,7 +9,7 @@ "number": 26, "state": "open", "locked": false, - "title": "Unittests for GithubHook", + "title": "Unittests for GitHubHook", "user": { "login": "kszucs", "id": 961747, diff --git a/dev/archery/archery/tests/test_bot.py b/dev/archery/archery/tests/test_bot.py index b5de2dfd21b4e..5d32cdfd9a59a 100644 --- a/dev/archery/archery/tests/test_bot.py +++ b/dev/archery/archery/tests/test_bot.py @@ -103,7 +103,7 @@ def test_noop_events(load_fixture, fixture_name): handler.assert_not_called() -def test_unathorized_user_comment(load_fixture, responses): +def test_unauthorized_user_comment(load_fixture, responses): responses.add( responses.GET, github_url('/repositories/169101701/issues/26'), diff --git a/dev/conbench_envs/README.md b/dev/conbench_envs/README.md index 5a4eb58b2447c..509dc5c0c9537 100644 --- a/dev/conbench_envs/README.md +++ b/dev/conbench_envs/README.md @@ -35,7 +35,7 @@ Benchmark builds use `hooks.sh` functions as hooks to create conda env with Arro Defining hooks in Arrow repo allows benchmark builds for a specific commit to be compatible with the files/scripts *in that commit* which are used for installing Arrow -dependencies and building Arrow. This allows Arrow contributors to asses the perfomance +dependencies and building Arrow. This allows Arrow contributors to asses the performance implications of different build options, dependency versions, etc by updating `hooks.sh`. 
diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 0f36a5ba9025c..ae482d69014ab 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -447,7 +447,7 @@ def clear_pr_state_labels(self, number): response = requests.get(url, headers=self.headers) labels = response.json() for label in labels: - # All PR workflow state labes starts with "awaiting" + # All PR workflow state labels start with "awaiting" if label['name'].startswith('awaiting'): label_url = f"{url}/{label['name']}" requests.delete(label_url, headers=self.headers) diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index 519c8339a4dba..df6c0778dc805 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -1088,7 +1088,6 @@ def available_apt_targets ["debian", "trixie", "main"], ["ubuntu", "focal", "main"], ["ubuntu", "jammy", "main"], - ["ubuntu", "lunar", "main"], ["ubuntu", "mantic", "main"], ] end diff --git a/dev/release/setup-rhel-rebuilds.sh b/dev/release/setup-rhel-rebuilds.sh index 9cdc50321599b..dc190d2d2426e 100755 --- a/dev/release/setup-rhel-rebuilds.sh +++ b/dev/release/setup-rhel-rebuilds.sh @@ -27,7 +27,7 @@ dnf -y install 'dnf-command(config-manager)' dnf config-manager --set-enabled powertools dnf -y update dnf -y module disable nodejs -dnf -y module enable nodejs:16 +dnf -y module enable nodejs:18 dnf -y module disable ruby dnf -y module enable ruby:2.7 dnf -y groupinstall "Development Tools" @@ -44,8 +44,7 @@ dnf -y install \ ninja-build \ nodejs \ openssl-devel \ - python38-devel \ - python38-pip \ + python3.11-devel \ ruby-devel \ sqlite-devel \ vala-devel \ @@ -54,5 +53,5 @@ dnf -y install \ npm install -g yarn -python3 -m pip install -U pip +python3 -m ensurepip --upgrade alternatives --set python /usr/bin/python3 diff --git a/dev/release/utils-update-docs-versions.py b/dev/release/utils-update-docs-versions.py index 7ca4059214db5..ba0ddcaeb39e1 100644 --- a/dev/release/utils-update-docs-versions.py +++ b/dev/release/utils-update-docs-versions.py @@ -38,7 +38,9 @@ else: release_type = "patch" -# Update main docs version script +# Update the main docs version script only when the compatible version of +# the stable version has changed. The compatible version is the +# ${MAJOR}.${MINOR} version. 
if release_type != "patch": with open(main_versions_path) as json_file: old_versions = json.load(json_file) @@ -47,23 +49,30 @@ stable_compatible_version = ".".join(split_version[:2]) previous_compatible_version = old_versions[1]["name"].split(" ")[0] - # Create new versions - new_versions = [ - {"name": f"{dev_compatible_version} (dev)", - "version": "dev/", - "url": "https://arrow.apache.org/docs/dev/"}, - {"name": f"{stable_compatible_version} (stable)", - "version": "", - "url": "https://arrow.apache.org/docs/", - "preferred": True}, - {"name": previous_compatible_version, - "version": f"{previous_compatible_version}/", - "url": f"https://arrow.apache.org/docs/{previous_compatible_version}/"}, - *old_versions[2:], - ] - with open(main_versions_path, 'w') as json_file: - json.dump(new_versions, json_file, indent=4) - json_file.write("\n") + # previous (compatible version) -> stable (compatible version) + # + # 13.Y.Z (13.Y) -> 14.0.0 (14.0): Update + # 14.0.0 (14.0) -> 14.0.1 (14.0): Not update + # 14.0.0 (14.0) -> 14.1.0 (14.1): Update + # 14.0.1 (14.0) -> 14.1.0 (14.1): Update + if stable_compatible_version != previous_compatible_version: + # Create new versions + new_versions = [ + {"name": f"{dev_compatible_version} (dev)", + "version": "dev/", + "url": "https://arrow.apache.org/docs/dev/"}, + {"name": f"{stable_compatible_version} (stable)", + "version": "", + "url": "https://arrow.apache.org/docs/", + "preferred": True}, + {"name": previous_compatible_version, + "version": f"{previous_compatible_version}/", + "url": f"https://arrow.apache.org/docs/{previous_compatible_version}/"}, + *old_versions[2:], + ] + with open(main_versions_path, 'w') as json_file: + json.dump(new_versions, json_file, indent=4) + json_file.write("\n") # Update R package version script @@ -76,7 +85,8 @@ previous_r_name = old_r_versions[1]["name"].split(" ")[0] previous_r_version = ".".join(previous_r_name.split(".")[:2]) -if release_type == "major": +if release_type == "major" and split_version[1:] == ["0", "0"]: + # 14.0.0 -> 15.0.0 new_r_versions = [ {"name": f"{dev_r_version} (dev)", "version": "dev/"}, {"name": f"{release_r_version} (release)", "version": ""}, @@ -84,6 +94,10 @@ *old_r_versions[2:], ] else: + # 14.0.1 -> 15.0.0 + # 14.0.0 -> 14.1.0 + # 14.0.1 -> 14.1.0 + # 14.0.0 -> 14.0.1 new_r_versions = [ {"name": f"{dev_r_version} (dev)", "version": "dev/"}, {"name": f"{release_r_version} (release)", "version": ""}, diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 05a7498a85180..ebdb493f8006e 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -23,7 +23,7 @@ # - Maven >= 3.3.9 # - JDK >=7 # - gcc >= 4.8 -# - Node.js >= 11.12 (best way is to use nvm) +# - Node.js >= 18 # - Go >= 1.19 # - Docker # @@ -153,17 +153,17 @@ verify_dir_artifact_signatures() { # verify the signature and the checksums of each artifact find $1 -name '*.asc' | while read sigfile; do artifact=${sigfile/.asc/} - gpg --verify $sigfile $artifact || exit 1 + gpg --verify $sigfile $artifact # go into the directory because the checksum files contain only the # basename of the artifact pushd $(dirname $artifact) base_artifact=$(basename $artifact) if [ -f $base_artifact.sha256 ]; then - ${sha256_verify} $base_artifact.sha256 || exit 1 + ${sha256_verify} $base_artifact.sha256 fi if [ -f $base_artifact.sha512 ]; then - ${sha512_verify} $base_artifact.sha512 || exit 1 + ${sha512_verify} $base_artifact.sha512 fi popd done @@ -171,7 +171,7 
@@ verify_dir_artifact_signatures() { test_binary() { show_header "Testing binary artifacts" - maybe_setup_conda || exit 1 + maybe_setup_conda local download_dir=binaries mkdir -p ${download_dir} @@ -195,8 +195,6 @@ test_apt() { "arm64v8/ubuntu:focal" \ "ubuntu:jammy" \ "arm64v8/ubuntu:jammy" \ - "ubuntu:lunar" \ - "arm64v8/ubuntu:lunar" \ "ubuntu:mantic" \ "arm64v8/ubuntu:mantic"; do \ case "${target}" in @@ -567,7 +565,7 @@ maybe_setup_nodejs() { test_package_java() { show_header "Build and test Java libraries" - maybe_setup_conda maven openjdk || exit 1 + maybe_setup_conda maven openjdk pushd java if [ ${TEST_JAVA} -gt 0 ]; then @@ -581,7 +579,7 @@ test_and_install_cpp() { show_header "Build, install and test C++ libraries" # Build and test C++ - maybe_setup_virtualenv numpy || exit 1 + maybe_setup_virtualenv numpy maybe_setup_conda \ --file ci/conda_env_unix.txt \ --file ci/conda_env_cpp.txt \ @@ -589,7 +587,7 @@ test_and_install_cpp() { ncurses \ numpy \ sqlite \ - compilers || exit 1 + compilers if [ "${USE_CONDA}" -gt 0 ]; then DEFAULT_DEPENDENCY_SOURCE="CONDA" @@ -673,8 +671,8 @@ test_python() { show_header "Build and test Python libraries" # Build and test Python - maybe_setup_virtualenv "cython>=0.29.31" numpy "setuptools_scm<8.0.0" setuptools || exit 1 - maybe_setup_conda --file ci/conda_env_python.txt || exit 1 + maybe_setup_virtualenv "cython>=0.29.31" numpy "setuptools_scm<8.0.0" setuptools + maybe_setup_conda --file ci/conda_env_python.txt if [ "${USE_CONDA}" -gt 0 ]; then CMAKE_PREFIX_PATH="${CONDA_BACKUP_CMAKE_PREFIX_PATH}:${CMAKE_PREFIX_PATH}" @@ -748,8 +746,8 @@ test_glib() { show_header "Build and test C GLib libraries" # Build and test C GLib - maybe_setup_conda glib gobject-introspection meson ninja ruby || exit 1 - maybe_setup_virtualenv meson || exit 1 + maybe_setup_conda glib gobject-introspection meson ninja ruby + maybe_setup_virtualenv meson # Install bundler if doesn't exist if ! bundle --version; then @@ -783,8 +781,8 @@ test_ruby() { show_header "Build and test Ruby libraries" # required dependencies are installed by test_glib - maybe_setup_conda || exit 1 - maybe_setup_virtualenv || exit 1 + maybe_setup_conda + maybe_setup_virtualenv which ruby which bundle @@ -846,8 +844,8 @@ test_csharp() { test_js() { show_header "Build and test JavaScript libraries" - maybe_setup_nodejs || exit 1 - maybe_setup_conda nodejs=18 || exit 1 + maybe_setup_nodejs + maybe_setup_conda nodejs=18 if ! command -v yarn &> /dev/null; then npm install yarn @@ -869,8 +867,8 @@ test_js() { test_go() { show_header "Build and test Go libraries" - maybe_setup_go || exit 1 - maybe_setup_conda compilers go=1.19 || exit 1 + maybe_setup_go + maybe_setup_conda compilers go=1.19 pushd go go get -v ./... @@ -902,8 +900,8 @@ test_go() { test_integration() { show_header "Build and execute integration tests" - maybe_setup_conda || exit 1 - maybe_setup_virtualenv || exit 1 + maybe_setup_conda + maybe_setup_virtualenv pip install -e dev/archery[integration] @@ -1069,8 +1067,10 @@ test_linux_wheels() { local pyver=${python/m} for platform in ${platform_tags}; do show_header "Testing Python ${pyver} wheel for platform ${platform}" - CONDA_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_conda || exit 1 - VENV_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_virtualenv || continue + CONDA_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_conda + if ! 
VENV_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_virtualenv; then + continue + fi pip install pyarrow-${TEST_PYARROW_VERSION:-${VERSION}}-cp${pyver/.}-cp${python/.}-${platform}.whl INSTALL_PYARROW=OFF ARROW_GCS=${check_gcs} ${ARROW_DIR}/ci/scripts/python_wheel_unix_test.sh ${ARROW_SOURCE_DIR} done @@ -1102,8 +1102,10 @@ test_macos_wheels() { check_s3=OFF fi - CONDA_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_conda || exit 1 - VENV_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_virtualenv || continue + CONDA_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_conda + if ! VENV_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_virtualenv; then + continue + fi pip install pyarrow-${VERSION}-cp${pyver/.}-cp${python/.}-${platform}.whl INSTALL_PYARROW=OFF ARROW_FLIGHT=${check_flight} ARROW_GCS=${check_gcs} ARROW_S3=${check_s3} \ @@ -1114,7 +1116,7 @@ test_macos_wheels() { test_wheels() { show_header "Downloading Python wheels" - maybe_setup_conda python || exit 1 + maybe_setup_conda python local wheels_dir= if [ "${SOURCE_KIND}" = "local" ]; then @@ -1153,7 +1155,7 @@ test_wheels() { test_jars() { show_header "Testing Java JNI jars" - maybe_setup_conda maven python || exit 1 + maybe_setup_conda maven python local download_dir=${ARROW_TMPDIR}/jars mkdir -p ${download_dir} diff --git a/dev/tasks/conda-recipes/README.md b/dev/tasks/conda-recipes/README.md index fc40733249b95..bb083c3ef07f4 100644 --- a/dev/tasks/conda-recipes/README.md +++ b/dev/tasks/conda-recipes/README.md @@ -19,7 +19,7 @@ # Conda Forge recipes -This directory must be migrated periodically with the upstrem updates of +This directory must be migrated periodically with the upstream updates of [arrow-cpp-feedstock][arrow-cpp-feedstock], [parquet-cpp-feedstock][parquet-cpp-feedstock]. conda-forge repositories because of multiple vendored files. diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 371b62245bb72..b8ffbfdb715b6 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -1,4 +1,4 @@ -# NOTE: In constrast to the conda-forge recipe, ARROW_VERSION is a templated variable here. +# NOTE: In contrast to the conda-forge recipe, ARROW_VERSION is a templated variable here. {% set version = ARROW_VERSION %} {% set cuda_enabled = cuda_compiler_version != "None" %} {% set build_ext_version = ARROW_VERSION %} diff --git a/dev/tasks/java-jars/README.md b/dev/tasks/java-jars/README.md index 758f74c95fbf1..216c7198d3239 100644 --- a/dev/tasks/java-jars/README.md +++ b/dev/tasks/java-jars/README.md @@ -20,7 +20,7 @@ limitations under the License. This directory is responsible to generate the jar files for the Arrow components that depend on C++ shared libraries to execute. -The Arrow C++ libraries are compiled both on MacOS and Linux distributions, with their dependencies linked statically, and they are added +The Arrow C++ libraries are compiled both on macOS and Linux distributions, with their dependencies linked statically, and they are added in the jars at the end, so the file can be used on both systems. 
## Linux Docker Image diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-lunar/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-lunar/Dockerfile deleted file mode 100644 index ec65b8d5bd257..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-lunar/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM ubuntu:lunar - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 221fb0caa8952..83a388c93051d 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (14.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Mon, 06 Nov 2023 22:23:27 -0000 + apache-arrow-apt-source (14.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/Rakefile b/dev/tasks/linux-packages/apache-arrow-release/Rakefile index dc2411f0de727..b7e1c87d13650 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/Rakefile +++ b/dev/tasks/linux-packages/apache-arrow-release/Rakefile @@ -43,7 +43,7 @@ class ApacheArrowReleasePackageTask < PackageTask keys_path = "#{@archive_base_name}/KEYS" download("https://www.apache.org/dyn/closer.lua?action=download&filename=arrow/KEYS", keys_path) - keys = File.read(keys_path) + keys = File.read(keys_path, encoding: "UTF-8") File.open(keys_path, "w") do |keys_file| is_ed25519_key = false deny_lists = [ @@ -54,7 +54,7 @@ class ApacheArrowReleasePackageTask < PackageTask # It seems that a subkey of this key may be related. 
"B90EB64A3AF15545EC8A7B8803F0D5EA3790810C", ] - is_denyed_key = false + is_denied_key = false keys.each_line do |line| case line.chomp when /\Apub\s+ed25519\s/ @@ -63,13 +63,13 @@ class ApacheArrowReleasePackageTask < PackageTask when /\Apub\s+[^\/]+\/([\h]+)\s/ short_finger_print = $1 if deny_lists.include?(short_finger_print) - is_denyed_key = true + is_denied_key = true next end when /\A\s+([\h]+)$/ long_finger_print = $1 if deny_lists.include?(long_finger_print) - is_denyed_key = true + is_denied_key = true next end when "-----END PGP PUBLIC KEY BLOCK-----" @@ -77,13 +77,13 @@ class ApacheArrowReleasePackageTask < PackageTask is_ed25519_key = false next end - if is_denyed_key - is_denyed_key = false + if is_denied_key + is_denied_key = false next end else next if is_ed25519_key - next if is_denyed_key + next if is_denied_key end keys_file.print(line) end diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 273bf32a2a8e4..245e8afeaeb1d 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 +- New upstream release. + * Thu Oct 19 2023 Raúl Cumplido - 14.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-lunar-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-lunar-arm64/from deleted file mode 100644 index 505787ef5ca87..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-lunar-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/ubuntu:lunar diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-lunar/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-lunar/Dockerfile deleted file mode 100644 index d7d53a8ca894a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-lunar/Dockerfile +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=ubuntu:lunar -FROM ${FROM} - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - clang \ - clang-tools \ - cmake \ - debhelper \ - devscripts \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libgtest-dev \ - liblz4-dev \ - libmlir-15-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-dev \ - lsb-release \ - meson \ - mlir-15-tools \ - ninja-build \ - nlohmann-json3-dev \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-pip \ - python3-setuptools \ - rapidjson-dev \ - tzdata \ - valac \ - zlib1g-dev && \ - if apt list | grep -q '^libcuda1'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - else \ - :; \ - fi && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 5e01d962c44d4..1f3f1bd5abd07 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (14.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Mon, 06 Nov 2023 22:23:27 -0000 + apache-arrow (14.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-sql-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-sql-glib-doc.doc-base index 74a9056ca78f4..5569fa83ed5c5 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-sql-glib-doc.doc-base +++ b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-sql-glib-doc.doc-base @@ -1,7 +1,7 @@ Document: arrow-flight-sql-glib Title: Apache Arrow Flight SQL GLib Reference Manual Author: The Apache Software Foundation -Abstract: Apache Arrow Flight SQL GLib provides a client-server framework to intract with SQL databases using Apache Arrow in-memory format and Apache Arrow Flight. +Abstract: Apache Arrow Flight SQL GLib provides a client-server framework to interact with SQL databases using Apache Arrow in-memory format and Apache Arrow Flight. Section: Programming Format: HTML diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index f61d47db2edd7..87e05558e8cda 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -864,6 +864,9 @@ Documentation for Apache Parquet GLib. 
%{_datadir}/gtk-doc/html/parquet-glib/ %changelog +* Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 +- New upstream release. + * Thu Oct 19 2023 Raúl Cumplido - 14.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb index da281d0ee2cf9..ecd61054daeb1 100644 --- a/dev/tasks/linux-packages/package-task.rb +++ b/dev/tasks/linux-packages/package-task.rb @@ -277,8 +277,6 @@ def apt_targets_default # "ubuntu-focal-arm64", "ubuntu-jammy", # "ubuntu-jammy-arm64", - "ubuntu-lunar", - # "ubuntu-lunar-arm64", "ubuntu-mantic", # "ubuntu-mantic-arm64", ] diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 9ee95dec15a14..8ba95af46af56 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -237,7 +237,7 @@ on: brew config brew doctor || true # The GHA runners install of python > 3.10 is incompatible with brew so we - # have to force overwritting of the symlinks + # have to force overwriting of the symlinks # see https://github.com/actions/runner-images/issues/6868 brew install --overwrite python@3.11 python@3.10 diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index c217f5901b583..35d74f1462453 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Prerequisits on the host: +# Prerequisites on the host: # - brew install bash bison coreutils ninja cmake # - sudo arrow/ci/scripts/install_python.sh macos 3.9 diff --git a/dev/tasks/r/github.macos-linux.local.yml b/dev/tasks/r/github.macos-linux.local.yml index c65eb79f613c4..8bce057059b97 100644 --- a/dev/tasks/r/github.macos-linux.local.yml +++ b/dev/tasks/r/github.macos-linux.local.yml @@ -41,7 +41,7 @@ jobs: if: contains(matrix.os, 'macOS') run: | brew install openssl - # disable sccache on macos as it timesout for unknown reasons + # disable sccache on macos as it times out for unknown reasons # see GH-33721 # brew install sccache - name: Configure dependencies (linux) diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 22dddabc10fab..2ddfd02e73134 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -211,40 +211,43 @@ jobs: matrix: platform: - { runs_on: 'windows-latest', name: "Windows"} - - { runs_on: ["self-hosted", "macos-10.13"], name: "macOS High Sierra"} - - { runs_on: ["self-hosted", "macOS", "arm64", "devops-managed"], name: "macOS Big Sur" } - r_version: - - { rtools: "{{ macros.r_release.rt }}", r: "{{ macros.r_release.ver }}" } - - { rtools: "{{ macros.r_oldrel.rt }}", r: "{{ macros.r_oldrel.ver }}" } + - { runs_on: macos-11 , name: "macOS Big Sur"} + - { runs_on: ["self-hosted", "macOS", "arm64", "devops-managed"], name: "macOS Big Sur (M1)" } + r_version: [oldrel, release] steps: - uses: r-lib/actions/setup-r@v2 # expression marker prevents the ! 
being parsed as yaml tag if: {{ "${{ !contains(matrix.platform.runs_on, 'self-hosted') }}" }} with: - r-version: {{ '${{ matrix.r_version.r }}' }} - rtools-version: {{ '${{ matrix.r_version.rtools }}' }} + r-version: {{ '${{ matrix.r_version }}' }} - name: Setup R Self-Hosted if: contains(matrix.platform.runs_on, 'self-hosted') run: | - if [ "{{ "${{ contains(matrix.platform.runs_on, 'arm64') }}" }}" == "true" ]; then - rig_arch="-arm64" - fi # rig is a system utility that allows for switching # between pre-installed R version on the self-hosted runners - rig default {{ '${{ matrix.r_version.r }}' }}$rig_arch + # rig add {{ '${{ matrix.r_version }}' }} #uncomment this to install latest release/oldrel + rig default {{ '${{ matrix.r_version }}' }} rig system setup-user-lib - rig system add-pak {{ macros.github_setup_local_r_repo(false, true, true)|indent }} - name: Prepare Dependency Installation shell: bash run: | tar -xzf repo/src/contrib/arrow_*.tar.gz arrow/DESCRIPTION - name: Install dependencies + if: {{ "${{ !contains(matrix.platform.runs_on, 'self-hosted') }}" }} uses: r-lib/actions/setup-r-dependencies@v2 with: working-directory: 'arrow' extra-packages: cpp11 + - name: Install dependencies self-hosted + if: {{ "${{ contains(matrix.platform.runs_on, 'self-hosted') }}" }} + shell: Rscript {0} + run: | + if (!requireNamespace("devtools", quietly = TRUE)) { + install.packages("devtools") + } + devtools::install_dev_deps('./arrow') - name: Set CRAN like openssl if: contains(matrix.platform.runs_on, 'arm64') run: | diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 15fac25d26d65..84c3cad6ac899 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -17,7 +17,7 @@ groups: # these groups are just for convenience - # makes it easier to submit related taskshttps://github.com/github/release-radar + # makes it easier to submit related tasks https://github.com/github/release-radar {############################# Packaging tasks ###############################} @@ -77,6 +77,9 @@ groups: c-glib: - test-*c-glib* + java: + - "*java*" + python: - test-*python* @@ -93,6 +96,9 @@ groups: ruby: - test-*ruby* + go: + - test*-go-* + vcpkg: - test-*vcpkg* @@ -557,7 +563,6 @@ tasks: "debian-trixie", "ubuntu-focal", "ubuntu-jammy", - "ubuntu-lunar", "ubuntu-mantic"] %} {% for architecture in ["amd64", "arm64"] %} {{ target }}-{{ architecture }}: @@ -971,11 +976,11 @@ tasks: - r-lib__libarrow__bin__darwin-arm64-openssl-3.0__arrow-{no_rc_r_version}\.zip - r-lib__libarrow__bin__darwin-x86_64-openssl-1.1__arrow-{no_rc_r_version}\.zip - r-lib__libarrow__bin__darwin-x86_64-openssl-3.0__arrow-{no_rc_r_version}\.zip - - r-pkg__bin__windows__contrib__4.1__arrow_{no_rc_r_version}\.zip + - r-pkg__bin__windows__contrib__4.3__arrow_{no_rc_r_version}\.zip - r-pkg__bin__windows__contrib__4.2__arrow_{no_rc_r_version}\.zip - - r-pkg__bin__macosx__contrib__4.1__arrow_{no_rc_r_version}\.tgz + - r-pkg__bin__macosx__big-sur-x86_64__contrib__4.3__arrow_{no_rc_r_version}\.tgz - r-pkg__bin__macosx__contrib__4.2__arrow_{no_rc_r_version}\.tgz - - r-pkg__bin__macosx__big-sur-arm64__contrib__4.1__arrow_{no_rc_r_version}\.tgz + - r-pkg__bin__macosx__big-sur-arm64__contrib__4.3__arrow_{no_rc_r_version}\.tgz - r-pkg__bin__macosx__big-sur-arm64__contrib__4.2__arrow_{no_rc_r_version}\.tgz - r-pkg__src__contrib__arrow_{no_rc_r_version}\.tar\.gz diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml index 093ae3fc5356e..618c997c2527b 100644 --- a/dev/tasks/vcpkg-tests/github.windows.yml 
+++ b/dev/tasks/vcpkg-tests/github.windows.yml @@ -41,7 +41,7 @@ jobs: shell: bash run: arrow/ci/scripts/download_tz_database.sh - name: Remove and Reinstall vcpkg - # When running vcpkg in Github Actions on Windows, remove the + # When running vcpkg in GitHub Actions on Windows, remove the # preinstalled vcpkg and install the newest version from source. # Versions of vcpkg rapidly stop working until updated, and # the safest and most reliable way to update vcpkg is simply diff --git a/dev/tasks/verify-rc/github.linux.amd64.yml b/dev/tasks/verify-rc/github.linux.amd64.yml index 8db6ed196bdb6..854020e85ce3d 100644 --- a/dev/tasks/verify-rc/github.linux.amd64.yml +++ b/dev/tasks/verify-rc/github.linux.amd64.yml @@ -59,10 +59,6 @@ jobs: distribution: 'temurin' java-version: '11' - - uses: actions/setup-node@v2 - with: - node-version: '16' - - name: Run verification shell: bash env: diff --git a/dev/tasks/verify-rc/github.macos.amd64.yml b/dev/tasks/verify-rc/github.macos.amd64.yml index 12dfb7d90f175..6c31b8b29d013 100644 --- a/dev/tasks/verify-rc/github.macos.amd64.yml +++ b/dev/tasks/verify-rc/github.macos.amd64.yml @@ -43,12 +43,6 @@ jobs: - name: Install System Dependencies shell: bash run: | - rm -f /usr/local/bin/2to3* - rm -f /usr/local/bin/idle* - rm -f /usr/local/bin/pydoc3* - rm -f /usr/local/bin/python3* - brew update || echo "brew update did not finish successfully" - brew install --overwrite git brew bundle --file=arrow/cpp/Brewfile brew bundle --file=arrow/c_glib/Brewfile {% endif %} @@ -62,9 +56,9 @@ jobs: with: dotnet-version: '7.0.x' - - uses: actions/setup-node@v2-beta + - uses: actions/setup-node@v4 with: - node-version: '16' + node-version: '18' - name: Run verification shell: bash diff --git a/dev/tasks/verify-rc/github.macos.arm64.yml b/dev/tasks/verify-rc/github.macos.arm64.yml index 75a061c4ddc07..c7777bb072b88 100644 --- a/dev/tasks/verify-rc/github.macos.arm64.yml +++ b/dev/tasks/verify-rc/github.macos.arm64.yml @@ -45,7 +45,7 @@ jobs: run: | brew bundle --file=arrow/cpp/Brewfile brew bundle --file=arrow/c_glib/Brewfile - export PATH="$(brew --prefix node@16)/bin:$PATH" + export PATH="$(brew --prefix node@18)/bin:$PATH" export PATH="$(brew --prefix ruby)/bin:$PATH" export PKG_CONFIG_PATH="$(brew --prefix ruby)/lib/pkgconfig" arrow/dev/release/verify-release-candidate.sh \ diff --git a/docker-compose.yml b/docker-compose.yml index e2c993ee9ea41..39cd473c2741b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,7 +33,7 @@ # command `make unittest` or `ctest --output-on-failure` the correct coredump # patterns must be set. # The kernel settings are coming from the host, so while it can be enabled from -# a running container using --priviled option the change will affect all other +# a running container using --privileged option the change will affect all other # containers, so prefer setting it explicitly, directly on the host. # WARNING: setting this will affect the host machine. 
# @@ -1706,7 +1706,8 @@ services: args: repo: ${REPO} arch: ${ARCH} - jdk: ${JDK} + # Use a newer JDK as it seems to improve stability + jdk: 17 # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should # be set to ${MAVEN} maven: 3.5 @@ -1716,8 +1717,9 @@ services: environment: <<: [*common, *ccache] ARCHERY_INTEGRATION_WITH_RUST: 0 - # Tell Archery where the arrow C++ binaries are located + # Tell Archery where Arrow binaries are located ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_RUST_EXE_PATH: /build/rust/debug command: ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build && /arrow/ci/scripts/integration_arrow.sh /arrow /build"] diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index bf84267aea97b..58f4554d11c16 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -33,7 +33,7 @@ } } -/* Contibuting landing page overview cards */ +/* Contributing landing page overview cards */ .contrib-card { border-radius: 0; @@ -68,7 +68,7 @@ } /* This is the bootstrap CSS style for "table-striped". Since the theme does -not yet provide an easy way to configure this globaly, it easier to simply +not yet provide an easy way to configure this globally, it easier to simply include this snippet here than updating each table in all rst files to add ":class: table-striped" */ @@ -76,7 +76,7 @@ add ":class: table-striped" */ background-color: rgba(0, 0, 0, 0.05); } -/* Iprove the vertical spacing in the C++ API docs +/* Improve the vertical spacing in the C++ API docs (ideally this should be upstreamed to the pydata-sphinx-theme */ dl.cpp dd p { diff --git a/docs/source/conf.py b/docs/source/conf.py index f11d78fe05682..cde0c2b31f8fd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -139,7 +139,7 @@ breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} breathe_default_project = "arrow_cpp" -# Overriden conditionally below +# Overridden conditionally below autodoc_mock_imports = [] # copybutton configuration diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index c893e41ff8d62..331cd833b58af 100644 --- a/docs/source/cpp/acero/developer_guide.rst +++ b/docs/source/cpp/acero/developer_guide.rst @@ -38,7 +38,7 @@ ExecNode is an abstract class with several pure virtual methods that control how -------------------------------- This method is called once at the start of the plan. Most nodes ignore this method (any -neccesary initialization should happen in the construtor or Init). However, source nodes +necessary initialization should happen in the constructor or Init). However, source nodes will typically provide a custom implementation. Source nodes should schedule whatever tasks are needed to start reading and providing the data. Source nodes are usually the primary creator of tasks in a plan. @@ -52,7 +52,7 @@ Examples ^^^^^^^^ * In the ``table_source`` node the input table is divided into batches. A task is created for - each batch and that task calls ``InputRecieved`` on the node's output. + each batch and that task calls ``InputReceived`` on the node's output. * In the ``scan`` node a task is created to start listing fragments from the dataset. Each listing task then creates tasks to read batches from the fragment, asynchronously. When the batch is full read in then a continuation schedules a new task with the exec plan. This task calls @@ -95,7 +95,7 @@ Examples This method will be called once per input. 
A node will call InputFinished on its output once it knows how many batches it will be sending to that output. Normally this happens when the node is -finished working. For example, a scan node will call InputFinished once it has finsihed reading +finished working. For example, a scan node will call InputFinished once it has finished reading its files. However, it could call it earlier if it knows (maybe from file metadata) how many batches will be created. @@ -173,10 +173,10 @@ There is no expectation or requirement that a node sends any remaining data it h schedules tasks (e.g. a source node) should stop producing new data. In addition to plan-wide cancellation, a node may call this method on its input if it has decided -that it has recevied all the data that it needs. However, because of parallelism, a node may still +that it has received all the data that it needs. However, because of parallelism, a node may still receive a few calls to ``InputReceived`` after it has stopped its input. -If any external reosurces are used then cleanup should happen as part of this call. +If any external resources are used then cleanup should happen as part of this call. Examples ^^^^^^^^ @@ -194,7 +194,7 @@ Initialization / Construction / Destruction Simple initialization logic (that cannot error) can be done in the constructor. If the initialization logic may return an invalid status then it can either be done in the exec node's factory method or the ``Init`` method. The factory method is preferred for simple validation. The ``Init`` method is -preferred if the intialization might do expensive allocation or other resource consumption. ``Init`` will +preferred if the initialization might do expensive allocation or other resource consumption. ``Init`` will always be called before ``StartProducing`` is called. Initialization could also be done in ``StartProducing`` but keep in mind that other nodes may have started by that point. @@ -264,7 +264,7 @@ have 20 files and 10 cores and you want to read and sort all the data. You coul 2 files to read and sort those files. Then you could create one extra plan that takes the input from these 10 child plans and merges the 10 input streams in a sorted fashion. -This approach is popular because it is how queries are distributed across mulitple servers and so it +This approach is popular because it is how queries are distributed across multiple servers and so it is widely supported and well understood. Acero does not do this today but there is no reason to prevent it. Adding shuffle & partition nodes to Acero should be a high priority and would enable Acero to be used by distributed systems. Once that has been done then it should be possible to do a local shuffle (local @@ -308,7 +308,7 @@ more complex to implement. Due to a lack of standard C++ async APIs, Acero uses a combination of the two approaches. Acero has two thread pools. The first is the CPU thread pool. This thread pool has one thread per core. Tasks in this thread pool should never -block (beyond minor delays for synchornization) and should generally be actively using CPU as much as possible. Threads +block (beyond minor delays for synchronization) and should generally be actively using CPU as much as possible. Threads on the I/O thread pool are expected to spend most of the time idle. They should avoid doing any CPU-intensive work. Their job is basically to wait for data to be available and schedule follow-up tasks on the CPU thread pool. 
@@ -329,7 +329,7 @@ exec nodes, scan, project, and then filter (this is a very common use case). No In a task-per-operator model we would have tasks like "Scan Batch 5", "Project Batch 5", and "Filter Batch 5". Each of those tasks is potentially going to access the same data. For example, maybe the `project` and `filter` nodes need to read the same column. A column which is intially created in a decode phase of the `scan` node. To maximize cache -utiliziation we would need to carefully schedule our tasks to ensure that all three of those tasks are run consecutively +utilization we would need to carefully schedule our tasks to ensure that all three of those tasks are run consecutively and assigned to the same CPU core. To avoid this problem we design tasks that run through as many nodes as possible before the task ends. This sequence @@ -378,7 +378,7 @@ yet had to address this problem. Let's go through some common situations: locality. However, since Acero uses a task-per-pipeline model there isn't much lost opportunity for cache parallelism that a scheduler could reclaim. Tasks only end when there is no more work that can be done with the data. -While there is not much prioritzation in place in Acero today we do have the tools to apply it should we need to. +While there is not much prioritization in place in Acero today we do have the tools to apply it should we need to. .. note:: In addition to the AsyncTaskScheduler there is another class called the TaskScheduler. This class predates the @@ -391,7 +391,7 @@ Intra-node Parallelism Some nodes can potentially exploit parallelism within a task. For example, in the scan node we can decode columns in parallel. In the hash join node, parallelism is sometimes exploited for complex tasks such as -building the hash table. This sort of parallelism is less common but not neccesarily discouraged. Profiling should +building the hash table. This sort of parallelism is less common but not necessarily discouraged. Profiling should be done first though to ensure that this extra parallelism will be helpful in your workload. All Work Happens in Tasks @@ -412,7 +412,7 @@ Ordered Execution ================= Some nodes either establish an ordering to their outgoing batches or they need to be able to process batches in order. -Acero handles ordering using the `batch_index` property on an ExecBatch. If a node has a determinstic output order +Acero handles ordering using the `batch_index` property on an ExecBatch. If a node has a deterministic output order then it should apply a batch index on batches that it emits. For example, the OrderByNode applies a new ordering to batches (regardless of the incoming ordering). The scan node is able to attach an implicit ordering to batches which reflects the order of the rows in the files being scanned. @@ -458,7 +458,7 @@ Profiling & Tracing =================== Acero's tracing is currently half-implemented and there are major gaps in profiling tools. However, there has been some -effort at tracing with open telemetry and most of the neccesary pieces are in place. The main thing currently lacking is +effort at tracing with open telemetry and most of the necessary pieces are in place. The main thing currently lacking is some kind of effective visualization of the tracing results. In order to use the tracing that is present today you will need to build with Arrow with `ARROW_WITH_OPENTELEMETRY=ON`. @@ -521,7 +521,7 @@ any particular engine design. For example, the hash join node uses utilities su and an exec batch builder. 
Other places share implementations of sequencing queues and row segmenters. The node itself should be kept minimal and simply maps from Acero to the abstraction. -This helps to decouple designs from Acero's design details and allows them to be more resilant to changes in the +This helps to decouple designs from Acero's design details and allows them to be more resilient to changes in the engine. It also helps to promote these abstractions as capabilities on their own. Either for use in other engines or for potential new additions to pyarrow as compute utilities. @@ -642,7 +642,7 @@ OrderBySink and SelectKSink --------------------------- These two exec nodes provided custom sink implementations. They were written before ordered execution -was added to Acero and were the only way to generate ordered ouptut. However, they had to be placed +was added to Acero and were the only way to generate ordered output. However, they had to be placed at the end of a plan and the fact that they were custom sink nodes made them difficult to describe with Declaration. The OrderByNode and FetchNode replace these. These are kept at the moment until existing bindings move away from them. @@ -680,7 +680,7 @@ Because of this, we highly recommend taking the following steps: * Any PR will need to have the following: - * Unit tests convering the new functionality + * Unit tests covering the new functionality * Microbenchmarks if there is any significant compute work going on @@ -688,5 +688,5 @@ Because of this, we highly recommend taking the following steps: * Updates to the API reference and this guide - * Passing CI (you can enable Github Actions on your fork and that will allow most CI jobs to run before + * Passing CI (you can enable GitHub Actions on your fork and that will allow most CI jobs to run before you create your PR) diff --git a/docs/source/cpp/acero/overview.rst b/docs/source/cpp/acero/overview.rst index 751b8d2c28d76..c569f82b099b6 100644 --- a/docs/source/cpp/acero/overview.rst +++ b/docs/source/cpp/acero/overview.rst @@ -58,7 +58,7 @@ A Library for Data Scientists Acero is not intended to be used directly by data scientists. It is expected that end users will typically be using some kind of frontend. For example, Pandas, Ibis, or SQL. The API for Acero is focused around capabilities and available algorithms. -However, such users may be intersted in knowing more about how Acero works so that +However, such users may be interested in knowing more about how Acero works so that they can better understand how the backend processing for their libraries operates. A Database @@ -149,7 +149,7 @@ strings to uppercase strings would not be a part of the core Arrow library becau require examining the contents of the array. The compute module expands on the core library and provides functions which analyze and -transform data. The compute module's capabilites are all exposed via a function registry. +transform data. The compute module's capabilities are all exposed via a function registry. An Arrow "function" accepts zero or more arrays, batches, or tables, and produces an array, batch, or table.
In addition, function calls can be combined, along with field references and literals, to form an expression (a tree of function calls) which the compute module can diff --git a/docs/source/cpp/acero/substrait.rst b/docs/source/cpp/acero/substrait.rst index 0d1c5bd02f8ae..797b2407f93cd 100644 --- a/docs/source/cpp/acero/substrait.rst +++ b/docs/source/cpp/acero/substrait.rst @@ -229,7 +229,7 @@ Functions for the functions ``and``, ``or``, ``xor`` * Substrait has not yet clearly identified the form that URIs should take for - standard functions. Acero will look for the URIs to the ``main`` Github branch. + standard functions. Acero will look for the URIs to the ``main`` GitHub branch. In other words, for the file ``functions_arithmetic.yaml`` Acero expects ``https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml`` diff --git a/docs/source/cpp/acero/user_guide.rst b/docs/source/cpp/acero/user_guide.rst index 333149caa7bb7..eca1a0104708b 100644 --- a/docs/source/cpp/acero/user_guide.rst +++ b/docs/source/cpp/acero/user_guide.rst @@ -171,7 +171,7 @@ can support all of these cases and can even support unique and custom situations There are pre-defined source nodes that cover the most common input scenarios. These are listed below. However, if your source data is unique then you will need to use the generic ``source`` node. This node expects you to -provide an asycnhronous stream of batches and is covered in more detail :ref:`here `. +provide an asynchronous stream of batches and is covered in more detail :ref:`here `. .. _ExecNode List: @@ -710,7 +710,7 @@ defining a join. The hash_join supports `_. Also the join-key (i.e. the column(s) to join on), and suffixes (i.e a suffix term like "_x" which can be appended as a suffix for column names duplicated in both left and right -relations.) can be set via the the join options. +relations.) can be set via the join options. `Read more on hash-joins `_. diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 44f43cbc877ca..47af9764150e5 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -155,7 +155,7 @@ is signed. For example: | float32, int64 | float32 | int64 is wider, still promotes to float32 | +-------------------+----------------------+------------------------------------------------+ -In particulary, note that comparing a ``uint64`` column to an ``int16`` column +In particular, note that comparing a ``uint64`` column to an ``int16`` column may emit an error if one of the ``uint64`` values cannot be expressed as the common type ``int64`` (for example, ``2 ** 63``). @@ -1622,10 +1622,10 @@ Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identidy element +their input using a given binary associative operation with an identity element (a monoid) and output an array containing the corresponding intermediate running values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are alsoavailable in an overflow-checking variant, +do not detect overflow. They are also available in an overflow-checking variant, suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. 
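+For example, a running sum can be computed by invoking ``cumulative_sum`` (or its
+``cumulative_sum_checked`` variant) by name through the generic
+``arrow::compute::CallFunction`` API; a minimal sketch:
+
+.. code-block:: cpp
+
+   #include <arrow/api.h>
+   #include <arrow/compute/api.h>
+
+   arrow::Status RunningTotals() {
+     // Build a small int64 array: [1, 2, 3]
+     arrow::Int64Builder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
+     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values, builder.Finish());
+
+     // Default variant: overflow is not detected.
+     ARROW_ASSIGN_OR_RAISE(arrow::Datum sums,
+                           arrow::compute::CallFunction("cumulative_sum", {values}));
+     // sums now holds [1, 3, 6]
+
+     // Overflow-checking variant: returns an Invalid Status if overflow occurs.
+     ARROW_ASSIGN_OR_RAISE(arrow::Datum checked_sums,
+                           arrow::compute::CallFunction("cumulative_sum_checked", {values}));
+     // checked_sums holds the same values here, since no overflow occurred.
+     return arrow::Status::OK();
+   }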
diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst index 922fef1498b9c..4e1fe76b4d6f2 100644 --- a/docs/source/cpp/datatypes.rst +++ b/docs/source/cpp/datatypes.rst @@ -186,7 +186,7 @@ here is how one might sum across columns of arbitrary numeric types: // Default implementation arrow::Status Visit(const arrow::Array& array) { - return arrow::Status::NotImplemented("Can not compute sum for array of type ", + return arrow::Status::NotImplemented("Cannot compute sum for array of type ", array.type()->ToString()); } diff --git a/docs/source/cpp/examples/compute_and_write_example.rst b/docs/source/cpp/examples/compute_and_write_example.rst index c4480a5f5cdf1..e66d3ced55d0c 100644 --- a/docs/source/cpp/examples/compute_and_write_example.rst +++ b/docs/source/cpp/examples/compute_and_write_example.rst @@ -23,6 +23,6 @@ Compute and Write CSV Example The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside the source tree contains an example of creating a table of two numerical columns -and then comparing the magnitudes of the entries in the columns and wrting out to +and then comparing the magnitudes of the entries in the columns and writing out to a CSV file with the column entries and their comparisons. The code in the example is documented. diff --git a/docs/source/cpp/examples/dataset_skyhook_scan_example.rst b/docs/source/cpp/examples/dataset_skyhook_scan_example.rst index 75a3954cf3dbd..4f7d558dcf07a 100644 --- a/docs/source/cpp/examples/dataset_skyhook_scan_example.rst +++ b/docs/source/cpp/examples/dataset_skyhook_scan_example.rst @@ -26,8 +26,8 @@ The file ``cpp/examples/arrow/dataset_skyhook_scan_example.cc`` located inside the source tree contains an example of using Skyhook to offload filters and projections to a Ceph cluster. -Instuctions -=========== +Instructions +============ .. note:: The instructions below are for Ubuntu 20.04 or above. diff --git a/docs/source/cpp/gandiva.rst b/docs/source/cpp/gandiva.rst index 3686f94af0ea6..07b07bee7ac4e 100644 --- a/docs/source/cpp/gandiva.rst +++ b/docs/source/cpp/gandiva.rst @@ -40,119 +40,27 @@ pre-compiled into LLVM IR (intermediate representation). .. _LLVM: https://llvm.org/ -Building Expressions -==================== - -Gandiva provides a general expression representation where expressions are -represented by a tree of nodes. The expression trees are built using -:class:`TreeExprBuilder`. The leaves of the expression tree are typically -field references, created by :func:`TreeExprBuilder::MakeField`, and -literal values, created by :func:`TreeExprBuilder::MakeLiteral`. Nodes -can be combined into more complex expression trees using: - -* :func:`TreeExprBuilder::MakeFunction` to create a function - node. (You can call :func:`GetRegisteredFunctionSignatures` to - get a list of valid function signatures.) -* :func:`TreeExprBuilder::MakeIf` to create if-else logic. -* :func:`TreeExprBuilder::MakeAnd` and :func:`TreeExprBuilder::MakeOr` - to create boolean expressions. (For "not", use the ``not(bool)`` function in ``MakeFunction``.) -* :func:`TreeExprBuilder::MakeInExpressionInt32` and the other "in expression" - functions to create set membership tests. - -Each of these functions create new composite nodes, which contain the leaf nodes -(literals and field references) or other composite nodes as children. By -composing these, you can create arbitrarily complex expression trees. 
- -Once an expression tree is built, they are wrapped in either :class:`Expression` -or :class:`Condition`, depending on how they will be used. -``Expression`` is used in projections while ``Condition`` is used in filters. - -As an example, here is how to create an Expression representing ``x + 3`` and a -Condition representing ``x < 3``: - -.. literalinclude:: ../../../cpp/examples/arrow/gandiva_example.cc - :language: cpp - :start-after: (Doc section: Create expressions) - :end-before: (Doc section: Create expressions) - :dedent: 2 - - -Projectors and Filters -====================== - -Gandiva's two execution kernels are :class:`Projector` and -:class:`Filter`. ``Projector`` consumes a record batch and projects -into a new record batch. ``Filter`` consumes a record batch and produces a -:class:`SelectionVector` containing the indices that matched the condition. - -For both ``Projector`` and ``Filter``, optimization of the expression IR happens -when creating instances. They are compiled against a static schema, so the -schema of the record batches must be known at this point. - -Continuing with the ``expression`` and ``condition`` created in the previous -section, here is an example of creating a Projector and a Filter: - -.. literalinclude:: ../../../cpp/examples/arrow/gandiva_example.cc - :language: cpp - :start-after: (Doc section: Create projector and filter) - :end-before: (Doc section: Create projector and filter) - :dedent: 2 - -Once a Projector or Filter is created, it can be evaluated on Arrow record batches. -These execution kernels are single-threaded on their own, but are designed to be -reused to process distinct record batches in parallel. - -Evaluating projections ----------------------- - -Execution is performed with :func:`Projector::Evaluate`. This outputs -a vector of arrays, which can be passed along with the output schema to -:func:`arrow::RecordBatch::Make()`. - -.. literalinclude:: ../../../cpp/examples/arrow/gandiva_example.cc - :language: cpp - :start-after: (Doc section: Evaluate projection) - :end-before: (Doc section: Evaluate projection) - :dedent: 2 - -Evaluating filters ------------------- - -:func:`Filter::Evaluate` produces :class:`SelectionVector`, -a vector of row indices that matched the filter condition. The selection vector -is a wrapper around an arrow integer array, parameterized by bitwidth. When -creating the selection vector (you must initialize it *before* passing to -``Evaluate()``), you must choose the bitwidth, which determines the max index -value it can hold, and the max number of slots, which determines how many indices -it may contain. In general, the max number of slots should be set to your batch -size and the bitwidth the smallest integer size that can represent all integers -less than the batch size. For example, if your batch size is 100k, set the -maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which -would be too small). - -Once ``Evaluate()`` has been run and the :class:`SelectionVector` is -populated, use the :func:`SelectionVector::ToArray()` method to get -the underlying array and then :func:`::arrow::compute::Take()` to materialize the -output record batch. - -.. 
literalinclude:: ../../../cpp/examples/arrow/gandiva_example.cc - :language: cpp - :start-after: (Doc section: Evaluate filter) - :end-before: (Doc section: Evaluate filter) - :dedent: 2 - -Evaluating projections and filters ----------------------------------- - -Finally, you can also project while apply a selection vector, with -:func:`Projector::Evaluate()`. To do so, first make sure to initialize the -:class:`Projector` with :func:`SelectionVector::GetMode()` so that the projector -compiles with the correct bitwidth. Then you can pass the -:class:`SelectionVector` into the :func:`Projector::Evaluate()` method. - - -.. literalinclude:: ../../../cpp/examples/arrow/gandiva_example.cc - :language: cpp - :start-after: (Doc section: Evaluate filter and projection) - :end-before: (Doc section: Evaluate filter and projection) - :dedent: 2 +Expression, Projector and Filter +================================ +To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, +including the creation of function nodes, if-else logic, and boolean expressions. +Subsequently, leverage ``Projector`` or ``Filter`` execution kernels to efficiently evaluate these expressions. +See :doc:`./gandiva/expr_projector_filter` for more details. + + +External Functions Development +============================== +Gandiva offers the capability of integrating external functions, encompassing +both C functions and IR functions. This feature broadens the spectrum of +functions that can be applied within Gandiva expressions. For developers +looking to customize and enhance their computational solutions, +Gandiva provides the opportunity to develop and register their own external +functions, thus allowing for a more tailored and flexible use of the Gandiva +environment. +See :doc:`./gandiva/external_func` for more details. + +.. toctree:: + :maxdepth: 2 + + gandiva/expr_projector_filter + gandiva/external_func \ No newline at end of file diff --git a/docs/source/cpp/gandiva/expr_projector_filter.rst b/docs/source/cpp/gandiva/expr_projector_filter.rst new file mode 100644 index 0000000000000..c960d1d869fe5 --- /dev/null +++ b/docs/source/cpp/gandiva/expr_projector_filter.rst @@ -0,0 +1,137 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========================================= +Gandiva Expression, Projector, and Filter +========================================= + +Building Expressions +==================== + +Gandiva provides a general expression representation where expressions are +represented by a tree of nodes. The expression trees are built using +:class:`TreeExprBuilder`. 
The leaves of the expression tree are typically +field references, created by :func:`TreeExprBuilder::MakeField`, and +literal values, created by :func:`TreeExprBuilder::MakeLiteral`. Nodes +can be combined into more complex expression trees using: + +* :func:`TreeExprBuilder::MakeFunction` to create a function + node. (You can call :func:`GetRegisteredFunctionSignatures` to + get a list of valid function signatures.) +* :func:`TreeExprBuilder::MakeIf` to create if-else logic. +* :func:`TreeExprBuilder::MakeAnd` and :func:`TreeExprBuilder::MakeOr` + to create boolean expressions. (For "not", use the ``not(bool)`` function in ``MakeFunction``.) +* :func:`TreeExprBuilder::MakeInExpressionInt32` and the other "in expression" + functions to create set membership tests. + +Each of these functions create new composite nodes, which contain the leaf nodes +(literals and field references) or other composite nodes as children. By +composing these, you can create arbitrarily complex expression trees. + +Once an expression tree is built, they are wrapped in either :class:`Expression` +or :class:`Condition`, depending on how they will be used. +``Expression`` is used in projections while ``Condition`` is used in filters. + +As an example, here is how to create an Expression representing ``x + 3`` and a +Condition representing ``x < 3``: + +.. literalinclude:: ../../../../cpp/examples/arrow/gandiva_example.cc + :language: cpp + :start-after: (Doc section: Create expressions) + :end-before: (Doc section: Create expressions) + :dedent: 2 + + +Projectors and Filters +====================== + +Gandiva's two execution kernels are :class:`Projector` and +:class:`Filter`. ``Projector`` consumes a record batch and projects +into a new record batch. ``Filter`` consumes a record batch and produces a +:class:`SelectionVector` containing the indices that matched the condition. + +For both ``Projector`` and ``Filter``, optimization of the expression IR happens +when creating instances. They are compiled against a static schema, so the +schema of the record batches must be known at this point. + +Continuing with the ``expression`` and ``condition`` created in the previous +section, here is an example of creating a Projector and a Filter: + +.. literalinclude:: ../../../../cpp/examples/arrow/gandiva_example.cc + :language: cpp + :start-after: (Doc section: Create projector and filter) + :end-before: (Doc section: Create projector and filter) + :dedent: 2 + +Once a Projector or Filter is created, it can be evaluated on Arrow record batches. +These execution kernels are single-threaded on their own, but are designed to be +reused to process distinct record batches in parallel. + +Evaluating projections +---------------------- + +Execution is performed with :func:`Projector::Evaluate`. This outputs +a vector of arrays, which can be passed along with the output schema to +:func:`arrow::RecordBatch::Make()`. + +.. literalinclude:: ../../../../cpp/examples/arrow/gandiva_example.cc + :language: cpp + :start-after: (Doc section: Evaluate projection) + :end-before: (Doc section: Evaluate projection) + :dedent: 2 + +Evaluating filters +------------------ + +:func:`Filter::Evaluate` produces :class:`SelectionVector`, +a vector of row indices that matched the filter condition. The selection vector +is a wrapper around an arrow integer array, parameterized by bitwidth. 
When +creating the selection vector (you must initialize it *before* passing to +``Evaluate()``), you must choose the bitwidth, which determines the max index +value it can hold, and the max number of slots, which determines how many indices +it may contain. In general, the max number of slots should be set to your batch +size and the bitwidth the smallest integer size that can represent all integers +less than the batch size. For example, if your batch size is 100k, set the +maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which +would be too small). + +Once ``Evaluate()`` has been run and the :class:`SelectionVector` is +populated, use the :func:`SelectionVector::ToArray()` method to get +the underlying array and then :func:`::arrow::compute::Take()` to materialize the +output record batch. + +.. literalinclude:: ../../../../cpp/examples/arrow/gandiva_example.cc + :language: cpp + :start-after: (Doc section: Evaluate filter) + :end-before: (Doc section: Evaluate filter) + :dedent: 2 + +Evaluating projections and filters +---------------------------------- + +Finally, you can also project while apply a selection vector, with +:func:`Projector::Evaluate()`. To do so, first make sure to initialize the +:class:`Projector` with :func:`SelectionVector::GetMode()` so that the projector +compiles with the correct bitwidth. Then you can pass the +:class:`SelectionVector` into the :func:`Projector::Evaluate()` method. + + +.. literalinclude:: ../../../../cpp/examples/arrow/gandiva_example.cc + :language: cpp + :start-after: (Doc section: Evaluate filter and projection) + :end-before: (Doc section: Evaluate filter and projection) + :dedent: 2 \ No newline at end of file diff --git a/docs/source/cpp/gandiva/external_func.mmd b/docs/source/cpp/gandiva/external_func.mmd new file mode 100644 index 0000000000000..755424bfa4203 --- /dev/null +++ b/docs/source/cpp/gandiva/external_func.mmd @@ -0,0 +1,49 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. 
+ +graph TD + Rust(Rust) --> CFunction(C function) + Cpp(C++) --> CFunction + OtherLangs(Other langs) --> CFunction + + C(C) --clang--> LLVMIR(LLVM IR) + Cpp1(C++) --clang--> LLVMIR + OtherLangs1(Other langs) --rustc/etc--> LLVMIR + + LLVMIR --LLVM toolchain--> LLVMBitcode(LLVM bitcode) + + CFunction --> Application(application) + LLVMBitcode --> Application + + Application --Register--> FunctionRegistry + + subgraph Gandiva + BuiltInIRFunctions(built-in IR functions) --> LLVMGenerator(LLVMGenerator) + BuiltInCFunctions(built-in C functions) --> LLVMGenerator + + FunctionRegistry(FunctionRegistry) --> LLVMGenerator + + + LLVMGenerator --> LLVMJITEngine(LLVM JIT engine) + + LLVMJITEngine --codegen--> MachineCode(machine code) + end + +classDef node stroke-width:0px; +class Rust,Cpp,OtherLangs,C,Cpp1,OtherLangs1,LLVMIR,LLVMBitcode,CFunction,Application,BuiltInIRFunctions,BuiltInCFunctions,FunctionRegistry,LLVMGenerator,LLVMJITEngine,MachineCode node; +classDef subGraph fill:#f5f5f5,stroke:#5a5a5a,stroke-width:2px,rx:10,ry:10; +class Gandiva subGraph; \ No newline at end of file diff --git a/docs/source/cpp/gandiva/external_func.png b/docs/source/cpp/gandiva/external_func.png new file mode 100644 index 0000000000000..3b17483ded4d8 Binary files /dev/null and b/docs/source/cpp/gandiva/external_func.png differ diff --git a/docs/source/cpp/gandiva/external_func.rst b/docs/source/cpp/gandiva/external_func.rst new file mode 100644 index 0000000000000..cdd8fc82e59db --- /dev/null +++ b/docs/source/cpp/gandiva/external_func.rst @@ -0,0 +1,272 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============================================ +Gandiva External Functions Development Guide +============================================ + +Introduction +============ + +Gandiva, as an analytical expression compiler framework, extends its functionality through external functions. This guide is focused on helping developers understand, create, and integrate external functions into Gandiva. External functions are user-defined, third-party functions that can be used in Gandiva expressions. + +Overview of External Function Types in Gandiva +============================================== + +Gandiva supports two primary types of external functions: + +* C Functions: Functions conforming to the C calling convention. Developers can implement functions in various languages (like C++, Rust, C, or Zig) and expose them as C functions to Gandiva. + +* IR Functions: Functions implemented in LLVM Intermediate Representation (LLVM IR). These can be written in multiple languages and then compiled into LLVM IR to be registered in Gandiva. 
+ +Choosing the Right Type of External Function for Your Needs +----------------------------------------------------------- + +When integrating external functions into Gandiva, it's crucial to select the type that best fits your specific requirements. Here are the key distinctions between C Functions and IR Functions to guide your decision: + +* C Functions + * **Language Flexibility:** C functions offer the flexibility to implement your logic in a preferred programming language and subsequently expose them as C functions. + * **Broad Applicability:** They are generally a go-to choice for a wide range of use cases due to their compatibility and ease of integration. + +* IR Functions + * **Recommended Use Cases:** IR functions excel in handling straightforward tasks that do not require elaborate logic or dependence on sophisticated third-party libraries. Unlike C functions, IR functions have the advantage of being inlinable, which is particularly beneficial for simple operations where the invocation overhead constitutes a significant expense. Additionally, they are an ideal choice for projects that are already integrated with the LLVM toolchain. + * **IR Compilation Requirement:** For IR functions, the entire implementation, including any third-party libraries used, must be compiled into LLVM IR. This might affect performance, especially if the dependent libraries are complex. + * **Limitations in Capabilities:** Certain advanced features, such as using thread-local variables, are not supported in IR functions. This is due to the limitations of the current JIT (Just-In-Time) engine utilized internally by Gandiva. + +.. image:: ./external_func.png + :alt: External C functions and IR functions integrating with Gandiva + +External function registration +============================== + +To make a function available to Gandiva, you need to register it as an external function, providing both a function's metadata and its implementation to Gandiva. + +Metadata Registration Using the ``NativeFunction`` Class +-------------------------------------------------------- + +To register a function in Gandiva, use the ``gandiva::NativeFunction`` class. This class captures both the signature and metadata of the external function. + +Constructor Details for ``gandiva::NativeFunction``: + +.. code-block:: cpp + + NativeFunction(const std::string& base_name, const std::vector<std::string>& aliases, + const DataTypeVector& param_types, const DataTypePtr& ret_type, + const ResultNullableType& result_nullable_type, std::string pc_name, + int32_t flags = 0); + +The ``NativeFunction`` class is used to define the metadata for an external function. Here is a breakdown of its constructor parameters: + +* ``base_name``: The name of the function as it will be used in expressions. +* ``aliases``: A list of alternative names for the function. +* ``param_types``: A vector of ``arrow::DataType`` objects representing the types of the parameters that the function accepts. +* ``ret_type``: A ``std::shared_ptr<arrow::DataType>`` representing the return type of the function. +* ``result_nullable_type``: This parameter indicates whether the result can be null, based on the nullability of the input arguments. It can take one of the following values: + * ``ResultNullableType::kResultNullIfNull``: result validity is an intersection of the validity of the children. + * ``ResultNullableType::kResultNullNever``: result is always valid. + * ``ResultNullableType::kResultNullInternal``: result validity depends on some internal logic.
+* ``pc_name``: The name of the corresponding precompiled function. + * Typically, this name follows the convention ``{base_name}`` + ``_{param1_type}`` + ``{param2_type}`` + ... + ``{paramN_type}``. For example, if the base name is ``add`` and the function takes two ``int32`` parameters and returns an ``int32``, the precompiled function name would be ``add_int32_int32``, but this convention is not mandatory as long as you can guarantee its uniqueness. +* ``flags``: Optional flags for additional function attributes (default is 0). Please check out ``NativeFunction::kNeedsContext``, ``NativeFunction::kNeedsFunctionHolder``, and ``NativeFunction::kCanReturnErrors`` for more details. + +After the function is registered, its implementation needs to be provided via either a C function pointer or an LLVM IR function. + +External C functions +-------------------- + +External C functions can be authored in different languages and exposed as C functions. Compatibility with Gandiva's type system is crucial. + +C Function Signature +******************** + +Signature Mapping +~~~~~~~~~~~~~~~~~ + +Not all Arrow data types are supported in Gandiva. The following table lists the mapping between Gandiva external function signature types and the C function signature types: + ++-------------------------------------+-------------------+ +| Gandiva type (arrow data type)      | C function type   | ++=====================================+===================+ +| int8                                | int8_t            | ++-------------------------------------+-------------------+ +| int16                               | int16_t           | ++-------------------------------------+-------------------+ +| int32                               | int32_t           | ++-------------------------------------+-------------------+ +| int64                               | int64_t           | ++-------------------------------------+-------------------+ +| uint8                               | uint8_t           | ++-------------------------------------+-------------------+ +| uint16                              | uint16_t          | ++-------------------------------------+-------------------+ +| uint32                              | uint32_t          | ++-------------------------------------+-------------------+ +| uint64                              | uint64_t          | ++-------------------------------------+-------------------+ +| float32                             | float             | ++-------------------------------------+-------------------+ +| float64                             | double            | ++-------------------------------------+-------------------+ +| boolean                             | bool              | ++-------------------------------------+-------------------+ +| date32                              | int32_t           | ++-------------------------------------+-------------------+ +| date64                              | int64_t           | ++-------------------------------------+-------------------+ +| timestamp                           | int64_t           | ++-------------------------------------+-------------------+ +| time32                              | int32_t           | ++-------------------------------------+-------------------+ +| time64                              | int64_t           | ++-------------------------------------+-------------------+ +| interval_month                      | int32_t           | ++-------------------------------------+-------------------+ +| interval_day_time                   | int64_t           | ++-------------------------------------+-------------------+ +| utf8 (as parameter type)            | const char*,      | +|                                     | uint32_t          | +|                                     | [see next section]| ++-------------------------------------+-------------------+ +| utf8 (as return type)               | int64_t context,  | +|                                     | const char*,      | +|                                     | uint32_t*         | +|                                     | [see next section]| ++-------------------------------------+-------------------+ +| binary (as parameter type)          | const char*,      | +|                                     | uint32_t          | +|                                     | [see next section]| ++-------------------------------------+-------------------+ +| binary (as return type)             | int64_t context,  | +|                                     | const char*,      | +|                                     | uint32_t*         | +|                                     | [see next section]|
++-------------------------------------+-------------------+ + +Handling arrow::StringType (utf8 type) and arrow::BinaryType +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Both ``arrow::StringType`` and ``arrow::BinaryType`` are variable-length types, and they are handled similarly in external functions. Since ``arrow::StringType`` (utf8 type) is more commonly used, we will use it below as the example to explain how to handle variable-length types in external functions. + +Using ``arrow::StringType`` (also known as the ``utf8`` type) as function parameter or return value needs special handling in external functions. This section provides details on how to handle ``arrow::StringType``. + +**As a Parameter:** + +When ``arrow::StringType`` is used as a parameter type in a function signature, the corresponding C function should be defined to accept two parameters: + +* ``const char*``: This parameter serves as a pointer to the string data. +* ``uint32_t``: This parameter represents the length of the string data. + +**As a Return Type:** + +When ``arrow::StringType`` (``utf8`` type) is used as the return type in a function signature, several specific considerations apply: + +1. **NativeFunction Metadata Flag:** + * The ``NativeFunction`` metadata for this function must include the ``NativeFunction::kNeedsContext`` flag. This flag is critical for ensuring proper context management in the function. + +2. **Function Parameters:** + * **Context Parameter**: The C function should begin with an additional parameter, ``int64_t context``. This parameter is crucial for context management within the function. + * **String Length Output Parameter**: The function should also include a ``uint32_t*`` parameter at the end. This output parameter will store the length of the returned string data. +3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. +4. **Function Implementation:** + * **Memory Allocation and Error Messaging:** Within the function's implementation, use ``gdv_fn_context_arena_malloc`` and ``gdv_fn_context_set_error_msg`` for memory allocation and error messaging, respectively. Both functions take ``int64_t context`` as their first parameter, facilitating efficient context utilization. + +External C function registration APIs +------------------------------------- + +You can use ``gandiva::FunctionRegistry``'s APIs to register external C functions: + +.. code-block:: cpp + + /// \brief register a C function into the function registry + /// @param func the registered function's metadata + /// @param c_function_ptr the function pointer to the + /// registered function's implementation + /// @param function_holder_maker this will be used as the function holder if the + /// function requires a function holder + arrow::Status Register( + NativeFunction func, void* c_function_ptr, + std::optional<FunctionHolderMaker> function_holder_maker = std::nullopt); + +The above API allows you to register an external C function. + +* The ``NativeFunction`` object describes the metadata of the external C function. +* The ``c_function_ptr`` is the function pointer to the external C function's implementation. +* The optional ``function_holder_maker`` is used to create a function holder for the external C function if the external C function requires a function holder. Check out the ``gandiva::FunctionHolder`` class and its several sub-classes for more details.
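+Putting the pieces together, the following sketch registers a hypothetical external C function
+named ``multiply_by_two`` (not one of Gandiva's built-ins) using the metadata class and the
+registration API described above:
+
+.. code-block:: cpp
+
+   #include <arrow/type.h>
+   #include <gandiva/function_registry.h>
+   #include <gandiva/native_function.h>
+
+   // The implementation. Parameter and return types follow the signature
+   // mapping above (Gandiva int32 <-> C int32_t).
+   extern "C" int32_t multiply_by_two_int32(int32_t value) { return value * 2; }
+
+   arrow::Status RegisterMultiplyByTwo(gandiva::FunctionRegistry& registry) {
+     // Metadata: expression name, aliases, parameter types, return type,
+     // result nullability, and the precompiled function name.
+     gandiva::NativeFunction multiply_by_two_func(
+         "multiply_by_two", /*aliases=*/{}, {arrow::int32()}, arrow::int32(),
+         gandiva::ResultNullableType::kResultNullIfNull, "multiply_by_two_int32");
+
+     // Provide the implementation as a C function pointer.
+     return registry.Register(multiply_by_two_func,
+                              reinterpret_cast<void*>(multiply_by_two_int32));
+   }
+
+Once registered, ``multiply_by_two`` can be referenced in expressions like a built-in function,
+provided the projector or filter is built against this function registry.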
+ +External IR functions +--------------------- + +IR function implementation +************************** + +Gandiva's support for IR (Intermediate Representation) functions provides the flexibility to implement these functions in various programming languages, depending on your specific needs. + +Examples and Tools for Compilation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. **Using C++ or C:** + + * If your IR functions are implemented in C++ or C, they can be compiled into LLVM bitcode, which is the intermediate representation understood by Gandiva. + * Compilation with Clang: For C++ implementations, you can utilize clang with the ``-emit-llvm`` option. This approach compiles your IR functions directly into LLVM bitcode, making them ready for integration with Gandiva. + +2. **Integrating with CMake:** + + * In projects where C++ is used alongside CMake, consider leveraging the ``GandivaAddBitcode.cmake`` module from the Arrow repository. This module can streamline the process of adding your custom bitcode to Gandiva. + +Consistency in Parameter and Return Types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is important to maintain consistency with the parameter and return types as established in C functions. Adhering to the rules discussed in the previous section ensures compatibility with Gandiva's type system. + +Registering External IR Functions in Gandiva +******************************************** + +1. **Post-Implementation and Compilation:** + + After successfully implementing and compiling your IR functions into LLVM bitcode, the next critical step is their registration within Gandiva. + +2. **Utilizing Gandiva's FunctionRegistry APIs:** + + Gandiva offers specific APIs within the ``gandiva::FunctionRegistry`` class to facilitate this registration process. + + **Registration APIs** + + * Registering from a Bitcode File: + + .. code-block:: cpp + + // Registers a set of functions from a specified bitcode file + arrow::Status Register(const std::vector<NativeFunction>& funcs, + const std::string& bitcode_path); + + * Registering from a Bitcode Buffer: + + .. code-block:: cpp + + // Registers a set of functions from a bitcode buffer + arrow::Status Register(const std::vector<NativeFunction>& funcs, + std::shared_ptr<arrow::Buffer> bitcode_buffer); + + **Key Points** + + * These APIs are designed to register a collection of external IR functions, either from a specified bitcode file or a preloaded bitcode buffer. + * It is essential to ensure that the bitcode file or buffer contains the correctly compiled IR functions. + * The ``NativeFunction`` instances play a crucial role in this process, serving to define the metadata for each of the external IR functions being registered. + +Conclusion +========== + +This guide provides an overview and detailed steps for integrating external functions into Gandiva. It covers both C and IR functions, and their registration in Gandiva. For more complex scenarios, refer to Gandiva's documentation and example implementations in source code. diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst index 33f075bd1818e..d67e0a7dec076 100644 --- a/docs/source/cpp/overview.rst +++ b/docs/source/cpp/overview.rst @@ -36,7 +36,7 @@ The one-dimensional layer ------------------------- **Data types** govern the *logical* interpretation of *physical* data. -Many operations in Arrow are parametered, at compile-time or at runtime, +Many operations in Arrow are parameterized, at compile-time or at runtime, by a data type.
**Arrays** assemble one or several buffers with a data type, allowing to diff --git a/docs/source/cpp/tutorials/basic_arrow.rst b/docs/source/cpp/tutorials/basic_arrow.rst index 06f5fde32e818..409dfcc40d28f 100644 --- a/docs/source/cpp/tutorials/basic_arrow.rst +++ b/docs/source/cpp/tutorials/basic_arrow.rst @@ -241,7 +241,7 @@ Making a Table One particularly useful thing we can do with the :class:`ChunkedArrays ` from the previous section is creating :class:`Tables `. Much like a :class:`RecordBatch`, a :class:`Table` stores tabular data. However, a :class:`Table` does not guarantee contiguity, due to being made up of :class:`ChunkedArrays `. -This can be useful for logic, paralellizing work, for fitting chunks into cache, or exceeding the 2,147,483,647 row limit +This can be useful for logic, parallelizing work, for fitting chunks into cache, or exceeding the 2,147,483,647 row limit present in :class:`Array` and, thus, :class:`RecordBatch`. If you read up to :class:`RecordBatch`, you may note that the :class:`Table` constructor in the following code is diff --git a/docs/source/developers/continuous_integration/archery.rst b/docs/source/developers/continuous_integration/archery.rst index 4b9f1f300e408..d190a0a96cfcb 100644 --- a/docs/source/developers/continuous_integration/archery.rst +++ b/docs/source/developers/continuous_integration/archery.rst @@ -68,7 +68,7 @@ You can inspect Archery usage by passing the ``--help`` flag: linking Quick and dirty utilities for checking library linkage. lint Check Arrow source tree for errors numpydoc Lint python docstring with NumpyDoc - release Release releated commands. + release Release related commands. trigger-bot Archery exposes independent subcommands, each of which provides dedicated diff --git a/docs/source/developers/continuous_integration/crossbow.rst b/docs/source/developers/continuous_integration/crossbow.rst index 6308f077ac9a6..50ac607f4d87b 100644 --- a/docs/source/developers/continuous_integration/crossbow.rst +++ b/docs/source/developers/continuous_integration/crossbow.rst @@ -75,7 +75,7 @@ The following guide depends on GitHub, but theoretically any git server can be used. If you are not using the `ursacomputing/crossbow`_ -repository, you will need to complete the first two steps, otherwise procede +repository, you will need to complete the first two steps, otherwise proceed to step 3: 1. `Create the queue repository`_ @@ -245,7 +245,7 @@ see its help page: .. _Wheels: python-wheels .. _Linux packages: linux-packages .. _Create the queue repository: https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-new-repository -.. _Github Actions: https://docs.github.com/en/actions/quickstart +.. _GitHub Actions: https://docs.github.com/en/actions/quickstart .. _Travis CI: https://travis-ci.com/getting-started/ .. _Azure Pipelines: https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/pipelines-sign-up .. _auto cancellation: https://docs.travis-ci.com/user/customizing-the-build/#building-only-the-latest-commit diff --git a/docs/source/developers/continuous_integration/docker.rst b/docs/source/developers/continuous_integration/docker.rst index 49cbffe5a425c..68f3c7d709791 100644 --- a/docs/source/developers/continuous_integration/docker.rst +++ b/docs/source/developers/continuous_integration/docker.rst @@ -199,8 +199,8 @@ For detailed examples see the docker-compose.yml. 
Build Scripts ~~~~~~~~~~~~~ -The scripts maintainted under ci/scripts directory should be kept -parametrizable but reasonably minimal to clearly encapsulate the tasks it is +The scripts maintained under ci/scripts directory should be kept +parameterizable but reasonably minimal to clearly encapsulate the tasks it is responsible for. Like: - ``cpp_build.sh``: build the C++ implementation without running the tests. diff --git a/docs/source/developers/continuous_integration/overview.rst b/docs/source/developers/continuous_integration/overview.rst index 3e155bf6001e9..93e74f269d50a 100644 --- a/docs/source/developers/continuous_integration/overview.rst +++ b/docs/source/developers/continuous_integration/overview.rst @@ -34,7 +34,7 @@ One thing to note is that some of the services defined in ``docker-compose.yml`` There are numerous important directories in the Arrow project which relate to CI: -- ``.github/worflows`` - workflows that are run via GitHub actions and are triggered by things like pull requests being submitted or merged +- ``.github/workflows`` - workflows that are run via GitHub actions and are triggered by things like pull requests being submitted or merged - ``dev/tasks`` - containing extended jobs triggered/submitted via ``archery crossbow submit ...``, typically nightly builds or relating to the release process - ``ci/`` - containing scripts, dockerfiles, and any supplemental files, e.g. patch files, conda environment files, vcpkg triplet files. @@ -46,7 +46,7 @@ Instead of thinking about Arrow CI in terms of files and folders, it may be conc Action-triggered builds ----------------------- -The ``.yml`` files in ``.github/worflows`` are workflows which are run on GitHub in response to specific actions. The majority of workflows in this directory are Arrow implementation-specific and are run when changes are made which affect code relevant to that language's implementation, but other workflows worth noting are: +The ``.yml`` files in ``.github/workflows`` are workflows which are run on GitHub in response to specific actions. The majority of workflows in this directory are Arrow implementation-specific and are run when changes are made which affect code relevant to that language's implementation, but other workflows worth noting are: - ``archery.yml`` - if changes are made to the Archery tool or tasks which it runs, this workflow runs the necessary validation checks - ``comment_bot.yml`` - triggers certain actions by listening on github pull request comments for the following strings: diff --git a/docs/source/developers/cpp/fuzzing.rst b/docs/source/developers/cpp/fuzzing.rst index bd7b303d4a107..851d58fb5651c 100644 --- a/docs/source/developers/cpp/fuzzing.rst +++ b/docs/source/developers/cpp/fuzzing.rst @@ -36,9 +36,9 @@ areas ingesting potentially invalid or malicious data. Fuzz Targets and Utilities ========================== -By passing the ``-DARROW_FUZZING=ON`` CMake option, you will build -the fuzz targets corresponding to the aforementioned Arrow features, as well -as additional related utilities. +By passing the ``-DARROW_FUZZING=ON`` CMake option (or equivalently, using +the ``fuzzing`` preset), you will build the fuzz targets corresponding to +the aforementioned Arrow features, as well as additional related utilities. Generating the seed corpus -------------------------- @@ -85,11 +85,7 @@ various sanitizer checks enabled. .. code-block:: - $ cmake .. 
-GNinja \ - -DCMAKE_BUILD_TYPE=Debug \ - -DARROW_USE_ASAN=on \ - -DARROW_USE_UBSAN=on \ - -DARROW_FUZZING=on + $ cmake .. --preset=fuzzing Then, assuming you have downloaded the crashing data file (let's call it ``testcase-arrow-ipc-file-fuzz-123465``), you can reproduce the crash @@ -101,3 +97,15 @@ by running the affected fuzz target on that file: (you may want to run that command under a debugger so as to inspect the program state more closely) + +Using conda +----------- + +The fuzzing executables must be compiled with clang and linked to libraries +which provide a fuzzing runtime. If you are using conda to provide your +dependencies, you may need to install these before building the fuzz targets: + +.. code-block:: + + $ conda install clang clangxx compiler-rt + $ cmake .. --preset=fuzzing diff --git a/docs/source/developers/documentation.rst b/docs/source/developers/documentation.rst index fcd8e84c7ac4d..8b1ea28c0f54b 100644 --- a/docs/source/developers/documentation.rst +++ b/docs/source/developers/documentation.rst @@ -136,7 +136,7 @@ GitHub Actions response, where you need to click on the Crossbow build badge: .. figure:: ./images/docs_preview_1.jpeg :scale: 70 % - :alt: Github-actions response with the crossbow build status. + :alt: GitHub Actions response with the crossbow build status. Crossbow build status diff --git a/docs/source/developers/guide/documentation.rst b/docs/source/developers/guide/documentation.rst index 22e8e0eae438b..3bb3bebef5098 100644 --- a/docs/source/developers/guide/documentation.rst +++ b/docs/source/developers/guide/documentation.rst @@ -84,7 +84,7 @@ library. Source folder includes: - **C++ documentation** section: ``docs/source/cpp``. - **Development** section: ``docs/source/developers``. -- **Specificatons and protocols** section: ``docs/source/format``. +- **Specifications and protocols** section: ``docs/source/format``. - **Language documentation** **C (GLib), Java, JavaScript** and **Python** documentation is located diff --git a/docs/source/developers/guide/resources.rst b/docs/source/developers/guide/resources.rst index f6e8db61e5148..f350f469af403 100644 --- a/docs/source/developers/guide/resources.rst +++ b/docs/source/developers/guide/resources.rst @@ -62,7 +62,7 @@ Additional information Other resources --------------- -Github +GitHub - `GitHub docs: Fork a repo `_ - `GitHub: Creating a pull request from a fork `_ diff --git a/docs/source/developers/guide/step_by_step/finding_issues.rst b/docs/source/developers/guide/step_by_step/finding_issues.rst index a3af1640a3b1d..390c56a81c73f 100644 --- a/docs/source/developers/guide/step_by_step/finding_issues.rst +++ b/docs/source/developers/guide/step_by_step/finding_issues.rst @@ -74,7 +74,7 @@ in the comments. Also, do not hesitate to ask questions in the comment. You can get some pointers about where to start and similar issues already solved. -**What if an issue is already asigned?** +**What if an issue is already assigned?** When in doubt, comment on the issue asking if they mind if you try to put together a pull request; interpret no response to mean that you’re free to proceed. diff --git a/docs/source/developers/guide/tutorials/r_tutorial.rst b/docs/source/developers/guide/tutorials/r_tutorial.rst index 5908064a7d492..62d5cfcbc76c2 100644 --- a/docs/source/developers/guide/tutorials/r_tutorial.rst +++ b/docs/source/developers/guide/tutorials/r_tutorial.rst @@ -86,7 +86,7 @@ link of the main repository to our upstream. 
Building R package ------------------ -The steps to follow for for building the R package differs depending on the operating +The steps to follow for building the R package differs depending on the operating system you are using. For this reason we will only refer to the instructions for the building process in this tutorial. diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index 8b2a504631fdb..0e831915e09b9 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -32,9 +32,12 @@ Arrow Java uses the `Maven `_ build system. Building requires: -* JDK 8, 9, 10, 11, 17, or 18, but only JDK 8, 11 and 17 are tested in CI. +* JDK 8+ * Maven 3+ +.. note:: + CI will test all supported JDK LTS versions, plus the latest non-LTS version. + Building ======== @@ -104,7 +107,7 @@ We can build these manually or we can use `Archery`_ to build them using a Docke Maven ~~~~~ -- To build only the JNI C Data Interface library (MacOS / Linux): +- To build only the JNI C Data Interface library (macOS / Linux): .. code-block:: text @@ -125,7 +128,7 @@ Maven $ dir "../java-dist/bin/x86_64" |__ arrow_cdata_jni.dll -- To build all JNI libraries (MacOS / Linux) except the JNI C Data Interface library: +- To build all JNI libraries (macOS / Linux) except the JNI C Data Interface library: .. code-block:: text @@ -150,7 +153,7 @@ Maven CMake ~~~~~ -- To build only the JNI C Data Interface library (MacOS / Linux): +- To build only the JNI C Data Interface library (macOS / Linux): .. code-block:: text @@ -189,7 +192,7 @@ CMake $ dir "java-dist/bin" |__ arrow_cdata_jni.dll -- To build all JNI libraries (MacOS / Linux) except the JNI C Data Interface library: +- To build all JNI libraries (macOS / Linux) except the JNI C Data Interface library: .. code-block:: @@ -390,7 +393,7 @@ Installing Nightly Packages These packages are not official releases. Use them at your own risk. Arrow nightly builds are posted on the mailing list at `builds@arrow.apache.org`_. -The artifacts are uploaded to GitHub. For example, for 2022/07/30, they can be found at `Github Nightly`_. +The artifacts are uploaded to GitHub. For example, for 2022/07/30, they can be found at `GitHub Nightly`_. Installing from Apache Nightlies @@ -426,7 +429,7 @@ Installing Manually ------------------- 1. Decide nightly packages repository to use, for example: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars -2. Add packages to your pom.xml, for example: flight-core (it depends on: arrow-format, arrow-vector, arrow-memeory-core and arrow-memory-netty). +2. Add packages to your pom.xml, for example: flight-core (it depends on: arrow-format, arrow-vector, arrow-memory-core and arrow-memory-netty). .. code-block:: xml @@ -537,4 +540,4 @@ Installing Manually 6. Compile your project like usual with ``mvn clean install``. .. _builds@arrow.apache.org: https://lists.apache.org/list.html?builds@arrow.apache.org -.. _Github Nightly: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars +.. 
_GitHub Nightly: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 6924c2d714e8b..0ff8e3a824ffc 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -183,7 +183,7 @@ Build source and binaries and submit them # Sign and upload the Java artifacts # - # Note that you need to press the "Close" button manually by Web interfacec + # Note that you need to press the "Close" button manually by Web interface # after you complete the script: # https://repository.apache.org/#stagingRepositories dev/release/06-java-upload.sh @@ -383,7 +383,7 @@ Be sure to go through on the following checklist: cd - # dev/release/post-12-msys2.sh 10.0.0 ../MINGW-packages - dev/release/post-12-msys2.sh X.Y.Z + dev/release/post-12-msys2.sh X.Y.Z This script pushes a ``arrow-X.Y.Z`` branch to your ``msys2/MINGW-packages`` fork. You need to create a pull request from the ``arrow-X.Y.Z`` branch with ``arrow: Update to X.Y.Z`` title on your Web browser. @@ -419,7 +419,7 @@ Be sure to go through on the following checklist: The package upload requires npm and yarn to be installed and 2FA to be configured on your account. - When you have access, you can publish releases to npm by running the the following script: + When you have access, you can publish releases to npm by running the following script: .. code-block:: Bash @@ -567,6 +567,9 @@ Be sure to go through on the following checklist: .. code-block:: Bash + # You can run the script with BUMP_TAG=0 and BUMP_PUSH=0 + # this will avoid default pushing to main and pushing the tag + # but you will require to push manually after reviewing the commits. # dev/release/post-11-bump-versions.sh 10.0.0 11.0.0 dev/release/post-11-bump-versions.sh X.Y.Z NEXT_X.NEXT_Y.NEXT_Z diff --git a/docs/source/developers/reviewing.rst b/docs/source/developers/reviewing.rst index 9a2e3dd7cc900..b6e0c1f4023bd 100644 --- a/docs/source/developers/reviewing.rst +++ b/docs/source/developers/reviewing.rst @@ -217,11 +217,11 @@ Social aspects * If you know someone who has the competence to help on a blocking issue and past experience suggests they may be willing to do so, feel free to - add them to the discussion (for example by gently pinging their Github + add them to the discussion (for example by gently pinging their GitHub handle). * If the contributor has stopped giving feedback or updating their PR, - perhaps they're not interested any more, but perhaps also they're stuck + perhaps they're not interested anymore, but perhaps also they're stuck on some issue and feel unable to push their contribution any further. Don't hesitate to ask (*"I see this PR hasn't seen any updates recently, are you stuck on something? Do you need any help?"*). diff --git a/docs/source/format/ADBC.rst b/docs/source/format/ADBC.rst index 0bd835e97db08..f90ab24d1b9c2 100644 --- a/docs/source/format/ADBC.rst +++ b/docs/source/format/ADBC.rst @@ -199,8 +199,8 @@ bypass this wrapper. implement the same protocol to try to reuse each other's work, e.g. several databases implement the Postgres wire protocol to benefit from its driver implementations. But the protocol itself - was not designed with multiple databases in mind, nor are they - generally meant to be used directly by applications. + was not designed with multiple databases in mind, nor are the + protocols generally meant to be used directly by applications. 
Some database-specific protocols are Arrow-native, like those of BigQuery and ClickHouse. Flight SQL additionally is meant to be diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e0884686acf6c..812212f536169 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -39,7 +39,7 @@ corresponding C FFI declarations. Applications and libraries can therefore work with Arrow memory without necessarily using Arrow libraries or reinventing the wheel. Developers can choose between tight integration -with the Arrow *software project* (benefitting from the growing array of +with the Arrow *software project* (benefiting from the growing array of facilities exposed by e.g. the C++ or Java implementations of Apache Arrow, but with the cost of a dependency) or minimal integration with the Arrow *format* only. @@ -140,10 +140,14 @@ strings: +-----------------+---------------------------------------------------+------------+ | ``Z`` | large binary | | +-----------------+---------------------------------------------------+------------+ +| ``vz`` | binary view | | ++-----------------+---------------------------------------------------+------------+ | ``u`` | utf-8 string | | +-----------------+---------------------------------------------------+------------+ | ``U`` | large utf-8 string | | +-----------------+---------------------------------------------------+------------+ +| ``vu`` | utf-8 view | | ++-----------------+---------------------------------------------------+------------+ | ``d:19,10`` | decimal128 [precision 19, scale 10] | | +-----------------+---------------------------------------------------+------------+ | ``d:19,10,NNN`` | decimal bitwidth = NNN [precision 19, scale 10] | | @@ -207,6 +211,10 @@ names and types of child fields are read from the child arrays. +------------------------+---------------------------------------------------+------------+ | ``+L`` | large list | | +------------------------+---------------------------------------------------+------------+ +| ``+vl`` | list-view | | ++------------------------+---------------------------------------------------+------------+ +| ``+vL`` | large list-view | | ++------------------------+---------------------------------------------------+------------+ | ``+w:123`` | fixed-sized list [123 items] | | +------------------------+---------------------------------------------------+------------+ | ``+s`` | struct | | @@ -243,6 +251,8 @@ Examples array has format string ``d:12,5``. * A ``list`` array has format string ``+l``, and its single child has format string ``L``. +* A ``large_list_view`` array has format string ``+Lv``, and its single + child has format string ``L``. * A ``struct`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and ``f`` respectively. @@ -542,6 +552,14 @@ parameterized extension types). The ``ArrowArray`` structure exported from an extension array simply points to the storage data of the extension array. +Binary view arrays +------------------ + +For binary or utf-8 view arrays, an extra buffer is appended which stores +the lengths of each variadic data buffer as ``int64_t``. This buffer is +necessary since these buffer lengths are not trivially extractable from +other data in an array of binary or utf-8 view type. + .. 
_c-data-interface-semantics: Semantics diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst index a584852df87eb..76b7132681b02 100644 --- a/docs/source/format/CDeviceDataInterface.rst +++ b/docs/source/format/CDeviceDataInterface.rst @@ -61,7 +61,7 @@ Goals * Make it easy for third-party projects to implement support with little initial investment. * Allow zero-copy sharing of Arrow formatted device memory between - independant runtimes and components running in the same process. + independent runtimes and components running in the same process. * Avoid the need for one-to-one adaptation layers such as the `CUDA Array Interface`_ for Python processes to pass CUDA data. * Enable integration without explicit dependencies (either at compile-time @@ -445,7 +445,7 @@ could be used for any device: array->release = NULL; } - void export_int32_device_array(void* cudaAllocdPtr, + void export_int32_device_array(void* cudaAllocedPtr, cudaStream_t stream, int64_t length, struct ArrowDeviceArray* array) { @@ -492,7 +492,7 @@ could be used for any device: array->array.buffers = (const void**)malloc(sizeof(void*) * array->array.n_buffers); assert(array->array.buffers != NULL); array->array.buffers[0] = NULL; - array->array.buffers[1] = cudaAllocdPtr; + array->array.buffers[1] = cudaAllocedPtr; } // calling the release callback should be done using the array member @@ -629,7 +629,7 @@ Result lifetimes '''''''''''''''' The data returned by the ``get_schema`` and ``get_next`` callbacks must be -released independantly. Their lifetimes are not tied to that of +released independently. Their lifetimes are not tied to that of ``ArrowDeviceArrayStream``. Stream lifetime diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 084b6e62895fd..86cfab718dd3c 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -130,7 +130,7 @@ Fixed shape tensor ``{ "shape": [100, 200, 500], "permutation": [2, 0, 1]}`` - This is the physical layout shape and the the shape of the logical + This is the physical layout shape and the shape of the logical layout would in this case be ``[500, 100, 200]``. .. note:: diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 3f8cd946292ea..a6632fa2cf81b 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -715,7 +715,7 @@ A struct array has its own validity bitmap that is independent of its child arrays' validity bitmaps. The validity bitmap for the struct array might indicate a null when one or more of its child arrays has a non-null value in its corresponding slot; or conversely, a child -array might have a null in its validity bitmap while the struct array's +array might indicate a null in its validity bitmap while the struct array's validity bitmap shows a non-null value. Therefore, to know whether a particular child entry is valid, one must @@ -988,7 +988,7 @@ access is less efficient.) to the length of the array and this would be confusing. -A run must have have a length of at least 1. This means the values in the +A run must have a length of at least 1. This means the values in the run ends array all are positive and in strictly ascending order. A run end cannot be null. 
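For the run-end-encoded layout described in Columnar.rst, a minimal sketch of how run ends map back to logical values may help; this is plain illustrative Python with made-up data, not the Arrow implementation:

.. code-block:: python

   # Illustrative run-end-encoded column: run ends are positive, strictly
   # ascending, and non-null; the last run end equals the logical length.
   run_ends = [3, 5, 9]           # "run ends" child array
   values = ["a", None, "b"]      # "values" child array, one entry per run

   def decode_ree(run_ends, values):
       out, start = [], 0
       for end, value in zip(run_ends, values):
           assert end > start      # every run has length >= 1
           out.extend([value] * (end - start))
           start = end
       return out

   decode_ree(run_ends, values)
   # ['a', 'a', 'a', None, None, 'b', 'b', 'b', 'b']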
diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index bea1c8f5e51de..73ca848b5e996 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -103,7 +103,7 @@ A client that wishes to download the data would: different endpoints may be interleaved to allow parallel fetches. Note that since some clients may ignore ``FlightInfo.ordered``, if - ordering is important and client support can not be ensured, + ordering is important and client support cannot be ensured, servers should return a single endpoint. The response also contains other metadata, like the schema, and diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index f7521c3876493..add044c2d3621 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -120,6 +120,23 @@ the ``type`` should be ``ClosePreparedStatement``). ``ActionCreatePreparedStatementRequest`` Create a new prepared statement for a SQL query. + The response will contain an opaque handle used to identify the + prepared statement. It may also contain two optional schemas: the + Arrow schema of the result set, and the Arrow schema of the bind + parameters (if any). Because the schema of the result set may + depend on the bind parameters, the schemas may not necessarily be + provided here as a result, or if provided, they may not be accurate. + Clients should not assume the schema provided here will be the + schema of any data actually returned by executing the prepared + statement. + + Some statements may have bind parameters without any specific type. + (As a trivial example for SQL, consider ``SELECT ?``.) It is + not currently specified how this should be handled in the bind + parameter schema above. We suggest either using a union type to + enumerate the possible types, or using the NA (null) type as a + wildcard/placeholder. + ``CommandPreparedStatementQuery`` Execute a previously created prepared statement and get the results. @@ -128,6 +145,10 @@ the ``type`` should be ``ClosePreparedStatement``). When used with GetFlightInfo: execute the prepared statement. The prepared statement can be reused after fetching results. + When used with GetSchema: get the expected Arrow schema of the + result set. If the client has bound parameter values with DoPut + previously, the server should take those values into account. + ``CommandPreparedStatementUpdate`` Execute a previously created prepared statement that does not return results. diff --git a/docs/source/java/dataset.rst b/docs/source/java/dataset.rst index a4381e0814638..ec816052e76d1 100644 --- a/docs/source/java/dataset.rst +++ b/docs/source/java/dataset.rst @@ -149,7 +149,7 @@ ScanOptions: ScanOptions options = new ScanOptions(32768, Optional.empty()); -Or use shortcut construtor: +Or use shortcut constructor: .. code-block:: Java @@ -199,7 +199,7 @@ Native Memory Management ======================== To gain better performance and reduce code complexity, Java -``FileSystemDataset`` internally relys on C++ +``FileSystemDataset`` internally relies on C++ ``arrow::dataset::FileSystemDataset`` via JNI. As a result, all Arrow data read from ``FileSystemDataset`` is supposed to be allocated off the JVM heap. 
To manage this part of memory, an utility class diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index f29d4db3941fd..4ee364fcf636b 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -53,7 +53,7 @@ Cumulative Functions -------------------- Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identidy element +their input using a given binary associative operation with an identity element (a monoid) and output an array containing the corresponding intermediate running values. The input is expected to be of numeric type. By default these functions do not detect overflow. They are also diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index 417a8a049c3d4..daab36f9a7be9 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -708,7 +708,7 @@ into memory: After the above example runs our data will be in dataset_root/1 and dataset_root/2 directories. In this simple example we are not changing the structure of the data -(only the directory naming schema) but you could also use this mechnaism to change +(only the directory naming schema) but you could also use this mechanism to change which columns are used to partition the dataset. This is useful when you expect to query your data in specific ways and you can utilize partitioning to reduce the amount of data you need to read. diff --git a/docs/source/python/getting_involved.rst b/docs/source/python/getting_involved.rst index 2271ad3cc02de..7b3bcf2ac527a 100644 --- a/docs/source/python/getting_involved.rst +++ b/docs/source/python/getting_involved.rst @@ -56,7 +56,7 @@ used as foundations to build easier to use entities. as is without modification. * The ``lib.pyx`` file is where the majority of the core C++ libarrow capabilities are exposed to Python. Most of the implementation of this - module relies on included ``*.pxi`` files where the specificic pieces + module relies on included ``*.pxi`` files where the specific pieces are built. While being exposed to Python as ``pyarrow.lib`` its content should be considered internal. The public classes are then directly exposed in other modules (like ``pyarrow`` itself) by virtue of importing them from diff --git a/docs/source/python/integration.rst b/docs/source/python/integration.rst index 997bc52102fd8..1cafc3dbded37 100644 --- a/docs/source/python/integration.rst +++ b/docs/source/python/integration.rst @@ -27,7 +27,7 @@ Developers can use Arrow to exchange data between various technologies and languages without incurring in any extra cost of marshalling/unmarshalling the data. The Arrow bindings and Arrow native libraries on the various platforms will all understand Arrow data -natively wihout the need to decode it. +natively without the need to decode it. This allows to easily integrate PyArrow with other languages and technologies. diff --git a/docs/source/python/integration/python_java.rst b/docs/source/python/integration/python_java.rst index 8b086485cf35f..0a242a4d393cc 100644 --- a/docs/source/python/integration/python_java.rst +++ b/docs/source/python/integration/python_java.rst @@ -246,7 +246,7 @@ We can use ``maven`` to collect all dependencies and make them available in a si Instead of manually collecting dependencies, you could also rely on the ``maven-assembly-plugin`` to build a single ``jar`` with all dependencies. 
-Once our package and all its depdendencies are available, +Once our package and all its dependencies are available, we can invoke it from ``fillten_pyarrowjvm.py`` script that will import the ``FillTen`` class and print out the result of invoking ``FillTen.createArray`` diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index e293699220c27..c354541a6779c 100644 --- a/docs/source/python/interchange_protocol.rst +++ b/docs/source/python/interchange_protocol.rst @@ -46,7 +46,7 @@ the consumer library can take and construct an object of it's own. .. code-block:: >>> import pyarrow as pa - >>> table = pa.table({"n_atendees": [100, 10, 1]}) + >>> table = pa.table({"n_attendees": [100, 10, 1]}) >>> table.__dataframe__() @@ -72,20 +72,20 @@ pyarrow table with the use of the interchange protocol: >>> import pandas as pd >>> df = pd.DataFrame({ - ... "n_atendees": [100, 10, 1], + ... "n_attendees": [100, 10, 1], ... "country": ["Italy", "Spain", "Slovenia"], ... }) >>> df - n_atendees country - 0 100 Italy - 1 10 Spain - 2 1 Slovenia + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia >>> from_dataframe(df) pyarrow.Table - n_atendees: int64 + n_attendees: int64 country: large_string ---- - n_atendees: [[100,10,1]] + n_attendees: [[100,10,1]] country: [["Italy","Spain","Slovenia"]] We can do the same with a polars dataframe: diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 76b06757c80da..23474b923718d 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -102,7 +102,7 @@ Let's allocate a resizable :class:`Buffer` from the default pool: pa.total_allocated_bytes() The default allocator requests memory in a minimum increment of 64 bytes. If -the buffer is garbaged-collected, all of the memory is freed: +the buffer is garbage-collected, all of the memory is freed: .. ipython:: python diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 24e6aa4fc0f9f..85a9674a689ca 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -428,7 +428,7 @@ metadata-only Parquet files. Note this is not a Parquet standard, but a convention set in practice by those frameworks. Using those files can give a more efficient creation of a parquet Dataset, -since it can use the stored schema and and file paths of all row groups, +since it can use the stored schema and file paths of all row groups, instead of inferring the schema and crawling the directories for all Parquet files (this is especially the case for filesystems where accessing files is expensive). 
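As a sketch of the ``_metadata`` convention described in parquet.rst (the ``pyarrow.parquet`` calls and the ``dataset_root`` path below are assumptions for illustration), collecting row-group metadata at write time and persisting it once lets a later dataset read reuse the stored schema and file paths instead of crawling every Parquet file:

.. code-block:: python

   import pyarrow as pa
   import pyarrow.parquet as pq

   table = pa.table({"n_attendees": [100, 10, 1]})

   # Collect the per-file row-group metadata while writing the dataset ...
   metadata_collector = []
   pq.write_to_dataset(table, root_path="dataset_root",
                       metadata_collector=metadata_collector)

   # ... and persist it once, so opening the dataset later can use the
   # stored schema and file paths rather than inferring them.
   pq.write_metadata(table.schema, "dataset_root/_metadata",
                     metadata_collector=metadata_collector)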
diff --git a/docs/source/status.rst b/docs/source/status.rst index fee9a27b6ca1a..6167d3037ba77 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -83,6 +83,10 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| List View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Large List View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Struct | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Map | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 3c9a719f1275f..581cf1f76d57c 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -551,7 +551,7 @@ enum SqlInfo { // Retrieves a int64 value representing the maximum number of characters allowed for a column name. SQL_MAX_COLUMN_NAME_LENGTH = 543; - // Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. + // Retrieves a int64 value representing the maximum number of columns allowed in a GROUP BY clause. SQL_MAX_COLUMNS_IN_GROUP_BY = 544; // Retrieves a int64 value representing the maximum number of columns allowed in an index. @@ -943,7 +943,7 @@ enum SqlSupportsConvert { /** * The JDBC/ODBC-defined type of any object. - * All the values here are the sames as in the JDBC and ODBC specs. + * All the values here are the same as in the JDBC and ODBC specs. */ enum XdbcDataType { XDBC_UNKNOWN_TYPE = 0; @@ -1023,14 +1023,14 @@ enum Nullable { NULLABILITY_NULLABLE = 1; /** - * Indicates that nullability of the fields can not be determined. + * Indicates that nullability of the fields cannot be determined. */ NULLABILITY_UNKNOWN = 2; } enum Searchable { /** - * Indicates that column can not be used in a WHERE clause. + * Indicates that column cannot be used in a WHERE clause. */ SEARCHABLE_NONE = 0; @@ -1196,7 +1196,7 @@ message CommandGetDbSchemas { * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. @@ -1537,11 +1537,14 @@ message ActionCreatePreparedStatementResult { bytes prepared_statement_handle = 1; // If a result set generating query was provided, dataset_schema contains the - // schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. + // schema of the result set. It should be an IPC-encapsulated Schema, as described in Schema.fbs. + // For some queries, the schema of the results may depend on the schema of the parameters. 
The server + // should provide its best guess as to the schema at this point. Clients must not assume that this + // schema, if provided, will be accurate. bytes dataset_schema = 2; // If the query provided contained parameters, parameter_schema contains the - // schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. + // schema of the expected parameters. It should be an IPC-encapsulated Schema, as described in Schema.fbs. bytes parameter_schema = 3; } @@ -1676,7 +1679,7 @@ message ActionEndSavepointRequest { * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. * - GetFlightInfo: execute the query. @@ -1702,7 +1705,7 @@ message CommandStatementQuery { * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. * - GetFlightInfo: execute the query. @@ -1740,9 +1743,12 @@ message TicketStatementQuery { * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * + * If the schema is retrieved after parameter values have been bound with DoPut, then the server should account + * for the parameters when determining the schema. * - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. * - GetFlightInfo: execute the prepared statement instance. */ @@ -1755,7 +1761,7 @@ message CommandPreparedStatementQuery { /* * Represents a SQL update query. Used in the command member of FlightDescriptor - * for the the RPC call DoPut to cause the server to execute the included SQL update. + * for the RPC call DoPut to cause the server to execute the included SQL update. */ message CommandStatementUpdate { option (experimental) = true; @@ -1768,7 +1774,7 @@ message CommandStatementUpdate { /* * Represents a SQL update query. 
Used in the command member of FlightDescriptor - * for the the RPC call DoPut to cause the server to execute the included + * for the RPC call DoPut to cause the server to execute the included * prepared statement handle as an update. */ message CommandPreparedStatementUpdate { diff --git a/format/Schema.fbs b/format/Schema.fbs index dbf482e6cc786..a03ca31ae97c4 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -372,7 +372,7 @@ table Time { /// no indication of how to map this information to a physical point in time. /// Naive date-times must be handled with care because of this missing /// information, and also because daylight saving time (DST) may make -/// some values ambiguous or non-existent. A naive date-time may be +/// some values ambiguous or nonexistent. A naive date-time may be /// stored as a struct with Date and Time fields. However, it may also be /// encoded into a Timestamp column with an empty timezone. The timestamp /// values should be computed "as if" the timezone of the date-time values diff --git a/format/adbc.h b/format/adbc.h index a1ff53441db28..14281e3a4c89a 100644 --- a/format/adbc.h +++ b/format/adbc.h @@ -475,7 +475,7 @@ struct ADBC_EXPORT AdbcError { /// Must be kept alive as long as any connections exist. struct ADBC_EXPORT AdbcDatabase { /// \brief Opaque implementation-defined state. - /// This field is NULLPTR iff the connection is unintialized/freed. + /// This field is NULLPTR iff the connection is uninitialized/freed. void* private_data; /// \brief The associated driver (used by the driver manager to help /// track state). @@ -498,7 +498,7 @@ struct ADBC_EXPORT AdbcDatabase { /// serialize accesses to a connection. struct ADBC_EXPORT AdbcConnection { /// \brief Opaque implementation-defined state. - /// This field is NULLPTR iff the connection is unintialized/freed. + /// This field is NULLPTR iff the connection is uninitialized/freed. void* private_data; /// \brief The associated driver (used by the driver manager to help /// track state). @@ -536,7 +536,7 @@ struct ADBC_EXPORT AdbcConnection { /// serialize accesses to a statement. struct ADBC_EXPORT AdbcStatement { /// \brief Opaque implementation-defined state. - /// This field is NULLPTR iff the connection is unintialized/freed. + /// This field is NULLPTR iff the connection is uninitialized/freed. void* private_data; /// \brief The associated driver (used by the driver manager to help @@ -575,7 +575,7 @@ struct AdbcPartitions { const size_t* partition_lengths; /// \brief Opaque implementation-defined state. - /// This field is NULLPTR iff the connection is unintialized/freed. + /// This field is NULLPTR iff the connection is uninitialized/freed. void* private_data; /// \brief Release the contained partitions. @@ -603,11 +603,11 @@ struct AdbcPartitions { /// worrying about multiple definitions of the same symbol. struct ADBC_EXPORT AdbcDriver { /// \brief Opaque driver-defined state. - /// This field is NULL if the driver is unintialized/freed (but + /// This field is NULL if the driver is uninitialized/freed (but /// it need not have a value even if the driver is initialized). void* private_data; /// \brief Opaque driver manager-defined state. - /// This field is NULL if the driver is unintialized/freed (but + /// This field is NULL if the driver is uninitialized/freed (but /// it need not have a value even if the driver is initialized). 
void* private_manager; diff --git a/go/arrow/array.go b/go/arrow/array.go index e07fa478aae57..eed859cf46649 100644 --- a/go/arrow/array.go +++ b/go/arrow/array.go @@ -81,6 +81,8 @@ type ArrayData interface { // Dictionary returns the ArrayData object for the dictionary if this is a // dictionary array, otherwise it will be nil. Dictionary() ArrayData + // SizeInBytes returns the size of the ArrayData buffers and any children and/or dictionary in bytes. + SizeInBytes() uint64 } // Array represents an immutable sequence of values using the Arrow in-memory format. diff --git a/go/arrow/array/binary_test.go b/go/arrow/array/binary_test.go index c9e165515225b..8a11ec7dab8c4 100644 --- a/go/arrow/array/binary_test.go +++ b/go/arrow/array/binary_test.go @@ -705,7 +705,7 @@ func TestBinaryViewStringRoundTrip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - values := []string{"a", "bc", "", "", "supercalifragilistic", "", "expeallodocious"} + values := []string{"a", "bc", "", "", "supercalifragilistic", "", "expialidocious"} valid := []bool{true, true, false, false, true, true, true} b := NewBinaryViewBuilder(mem) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index fa3554c1c0555..f0bc2855eb1e4 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -695,7 +695,7 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, } out.childData = []arrow.ArrayData{children} case *arrow.StructType: - out.childData = make([]arrow.ArrayData, len(dt.Fields())) + out.childData = make([]arrow.ArrayData, dt.NumFields()) for i := range dt.Fields() { children := gatherChildren(data, i) for _, c := range children { diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index 7b22d97a41e00..a3686b45700aa 100644 --- a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -694,7 +694,7 @@ func TestConcatAlmostOverflowRunEndEncoding(t *testing.T) { defer bldr.Release() valBldr := bldr.ValueBuilder().(*array.StringBuilder) - // max is not evently divisible by 4, so we add one to each + // max is not evenly divisible by 4, so we add one to each // to account for that so our final concatenate will overflow bldr.Append((tt.max / 4) + 1) valBldr.Append("foo") @@ -741,7 +741,7 @@ func TestConcatOverflowRunEndEncoding(t *testing.T) { defer bldr.Release() valBldr := bldr.ValueBuilder().(*array.StringBuilder) - // max is not evently divisible by 4, so we add one to each + // max is not evenly divisible by 4, so we add one to each // to account for that so our final concatenate will overflow bldr.Append((tt.max / 4) + 1) valBldr.Append("foo") diff --git a/go/arrow/array/data.go b/go/arrow/array/data.go index 8cce49182b879..ddd9cf0c895d5 100644 --- a/go/arrow/array/data.go +++ b/go/arrow/array/data.go @@ -190,9 +190,34 @@ func (d *Data) SetDictionary(dict arrow.ArrayData) { } } +// SizeInBytes returns the size of the Data and any children and/or dictionary in bytes by +// recursively examining the nested structures of children and/or dictionary. +// The value returned is an upper-bound since offset is not taken into account. 
+func (d *Data) SizeInBytes() uint64 { + var size uint64 + + if d == nil { + return 0 + } + + for _, b := range d.Buffers() { + size += uint64(b.Len()) + } + for _, c := range d.Children() { + size += c.SizeInBytes() + } + if d.dictionary != nil { + size += d.dictionary.SizeInBytes() + } + + return size +} + // NewSliceData returns a new slice that shares backing data with the input. // The returned Data slice starts at i and extends j-i elements, such as: -// slice := data[i:j] +// +// slice := data[i:j] +// // The returned value must be Release'd after use. // // NewSliceData panics if the slice is outside the valid range of the input Data. diff --git a/go/arrow/array/data_test.go b/go/arrow/array/data_test.go index b7b0f396470d7..dd4793a7cdbfa 100644 --- a/go/arrow/array/data_test.go +++ b/go/arrow/array/data_test.go @@ -49,3 +49,78 @@ func TestDataReset(t *testing.T) { data.Reset(&arrow.Int64Type{}, 5, data.Buffers(), nil, 1, 2) } } + +func TestSizeInBytes(t *testing.T) { + var buffers1 = make([]*memory.Buffer, 0, 3) + + for i := 0; i < cap(buffers1); i++ { + buffers1 = append(buffers1, memory.NewBufferBytes([]byte("15-bytes-buffer"))) + } + data := NewData(&arrow.StringType{}, 10, buffers1, nil, 0, 0) + var arrayData arrow.ArrayData = data + dataWithChild := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{arrayData}, 0, 0) + + t.Run("buffers only", func(t *testing.T) { + expectedSize := uint64(45) + if actualSize := data.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and child data", func(t *testing.T) { + // 45 bytes in buffers, 45 bytes in child data + expectedSize := uint64(90) + if actualSize := dataWithChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and nested child data", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + var dataWithNestedChild arrow.ArrayData = NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{dataWithChildArrayData}, 0, 0) + // 45 bytes in buffers, 90 bytes in nested child data + expectedSize := uint64(135) + if actualSize := dataWithNestedChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and dictionary", func(t *testing.T) { + dictData := data + dataWithDict := NewDataWithDictionary(&arrow.StringType{}, 10, buffers1, 0, 0, dictData) + // 45 bytes in buffers, 45 bytes in dictionary + expectedSize := uint64(90) + if actualSize := dataWithDict.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(45) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data with children", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + sliceData := NewSliceData(dataWithChildArrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers with children which are sliced data", 
func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + dataWithSlicedChildren := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{sliceData}, 0, 0) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := dataWithSlicedChildren.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) +} diff --git a/go/arrow/array/dictionary.go b/go/arrow/array/dictionary.go index 856f91605ff53..125c02391f340 100644 --- a/go/arrow/array/dictionary.go +++ b/go/arrow/array/dictionary.go @@ -739,7 +739,7 @@ func (b *dictionaryBuilder) UnmarshalJSON(data []byte) error { } if delim, ok := t.(json.Delim); !ok || delim != '[' { - return fmt.Errorf("dictionary builder must upack from json array, found %s", delim) + return fmt.Errorf("dictionary builder must unpack from json array, found %s", delim) } return b.Unmarshal(dec) @@ -1533,7 +1533,7 @@ type DictionaryUnifier interface { // values, an error will be returned instead. The new unified dictionary // is returned. GetResultWithIndexType(indexType arrow.DataType) (arrow.Array, error) - // Release should be called to clean up any allocated scrach memo-table used + // Release should be called to clean up any allocated scratch memo-table used // for building the unified dictionary. Release() } diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index f10e2072c43a2..4b62734116797 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -926,7 +926,7 @@ func (a *LargeListView) Release() { a.values.Release() } -// Acessors for offsets and sizes to make ListView and LargeListView validation generic. +// Accessors for offsets and sizes to make ListView and LargeListView validation generic. 
type offsetsAndSizes interface { offsetAt(slot int64) int64 sizeAt(slot int64) int64 diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index 11404b2d8bb95..011b5d7426b22 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -541,7 +541,7 @@ func TestListArraySlice(t *testing.T) { } } -func TestLisViewtArraySlice(t *testing.T) { +func TestListViewArraySlice(t *testing.T) { tests := []struct { typeID arrow.Type offsets interface{} diff --git a/go/arrow/array/record.go b/go/arrow/array/record.go index d080f726e472d..f25e7c9a874b3 100644 --- a/go/arrow/array/record.go +++ b/go/arrow/array/record.go @@ -185,7 +185,7 @@ func (rec *simpleRecord) validate() error { return nil } - if len(rec.arrs) != len(rec.schema.Fields()) { + if len(rec.arrs) != rec.schema.NumFields() { return fmt.Errorf("arrow/array: number of columns/fields mismatch") } @@ -285,11 +285,11 @@ func NewRecordBuilder(mem memory.Allocator, schema *arrow.Schema) *RecordBuilder refCount: 1, mem: mem, schema: schema, - fields: make([]Builder, len(schema.Fields())), + fields: make([]Builder, schema.NumFields()), } - for i, f := range schema.Fields() { - b.fields[i] = NewBuilder(b.mem, f.Type) + for i := 0; i < schema.NumFields(); i++ { + b.fields[i] = NewBuilder(b.mem, schema.Field(i).Type) } return b @@ -397,8 +397,8 @@ func (b *RecordBuilder) UnmarshalJSON(data []byte) error { } } - for i, f := range b.schema.Fields() { - if !keylist[f.Name] { + for i := 0; i < b.schema.NumFields(); i++ { + if !keylist[b.schema.Field(i).Name] { b.fields[i].AppendNull() } } diff --git a/go/arrow/array/string_test.go b/go/arrow/array/string_test.go index 803fae51347c1..3df56a2675252 100644 --- a/go/arrow/array/string_test.go +++ b/go/arrow/array/string_test.go @@ -189,7 +189,7 @@ func TestStringBuilder_Empty(t *testing.T) { } // TestStringReset tests the Reset() method on the String type by creating two different Strings and then -// reseting the contents of string2 with the values from string1. +// resetting the contents of string2 with the values from string1. func TestStringReset(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) sb1 := array.NewStringBuilder(mem) @@ -471,7 +471,7 @@ func TestLargeStringBuilder_Empty(t *testing.T) { } // TestStringReset tests the Reset() method on the String type by creating two different Strings and then -// reseting the contents of string2 with the values from string1. +// resetting the contents of string2 with the values from string1. func TestLargeStringReset(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) sb1 := array.NewLargeStringBuilder(mem) @@ -771,7 +771,7 @@ func TestStringViewBuilder_Empty(t *testing.T) { } // TestStringReset tests the Reset() method on the String type by creating two different Strings and then -// reseting the contents of string2 with the values from string1. +// resetting the contents of string2 with the values from string1. 
func TestStringViewReset(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) sb1 := array.NewStringViewBuilder(mem) diff --git a/go/arrow/array/struct.go b/go/arrow/array/struct.go index 94052953852c2..5e723c9dc7553 100644 --- a/go/arrow/array/struct.go +++ b/go/arrow/array/struct.go @@ -224,7 +224,7 @@ func NewStructBuilder(mem memory.Allocator, dtype *arrow.StructType) *StructBuil b := &StructBuilder{ builder: builder{refCount: 1, mem: mem}, dtype: dtype, - fields: make([]Builder, len(dtype.Fields())), + fields: make([]Builder, dtype.NumFields()), } for i, f := range dtype.Fields() { b.fields[i] = NewBuilder(b.mem, f.Type) diff --git a/go/arrow/array/table.go b/go/arrow/array/table.go index a987dd057f82c..b34650e182de4 100644 --- a/go/arrow/array/table.go +++ b/go/arrow/array/table.go @@ -137,11 +137,11 @@ func NewTable(schema *arrow.Schema, cols []arrow.Column, rows int64) *simpleTabl // - the total length of each column's array slice (ie: number of rows // in the column) aren't the same for all columns. func NewTableFromSlice(schema *arrow.Schema, data [][]arrow.Array) *simpleTable { - if len(data) != len(schema.Fields()) { + if len(data) != schema.NumFields() { panic("array/table: mismatch in number of columns and data for creating a table") } - cols := make([]arrow.Column, len(schema.Fields())) + cols := make([]arrow.Column, schema.NumFields()) for i, arrs := range data { field := schema.Field(i) chunked := arrow.NewChunked(field.Type, arrs) @@ -177,7 +177,7 @@ func NewTableFromSlice(schema *arrow.Schema, data [][]arrow.Array) *simpleTable // NewTableFromRecords panics if the records and schema are inconsistent. func NewTableFromRecords(schema *arrow.Schema, recs []arrow.Record) *simpleTable { arrs := make([]arrow.Array, len(recs)) - cols := make([]arrow.Column, len(schema.Fields())) + cols := make([]arrow.Column, schema.NumFields()) defer func(cols []arrow.Column) { for i := range cols { @@ -224,7 +224,7 @@ func (tbl *simpleTable) NumCols() int64 { return int64(len(tbl.cols) func (tbl *simpleTable) Column(i int) *arrow.Column { return &tbl.cols[i] } func (tbl *simpleTable) validate() { - if len(tbl.cols) != len(tbl.schema.Fields()) { + if len(tbl.cols) != tbl.schema.NumFields() { panic(errors.New("arrow/array: table schema mismatch")) } for i, col := range tbl.cols { diff --git a/go/arrow/array/union.go b/go/arrow/array/union.go index c0a5050560634..1af3e70472065 100644 --- a/go/arrow/array/union.go +++ b/go/arrow/array/union.go @@ -69,7 +69,7 @@ type Union interface { // or arrow.DenseMode. Mode() arrow.UnionMode // Field returns the requested child array for this union. Returns nil if a - // non-existent position is passed in. + // nonexistent position is passed in. // // The appropriate child for an index can be retrieved with Field(ChildID(index)) Field(pos int) arrow.Array @@ -896,7 +896,7 @@ func NewEmptySparseUnionBuilder(mem memory.Allocator) *SparseUnionBuilder { // children and type codes. 
Builders will be constructed for each child // using the fields in typ func NewSparseUnionBuilder(mem memory.Allocator, typ *arrow.SparseUnionType) *SparseUnionBuilder { - children := make([]Builder, len(typ.Fields())) + children := make([]Builder, typ.NumFields()) for i, f := range typ.Fields() { children[i] = NewBuilder(mem, f.Type) defer children[i].Release() @@ -980,7 +980,7 @@ func (b *SparseUnionBuilder) AppendEmptyValues(n int) { // // After appending to the corresponding child builder, all other child // builders should have a null or empty value appended to them (although -// this is not enfoced and any value is theoretically allowed and will be +// this is not enforced and any value is theoretically allowed and will be // ignored). func (b *SparseUnionBuilder) Append(nextType arrow.UnionTypeCode) { b.typesBuilder.AppendValue(nextType) @@ -1129,7 +1129,7 @@ func NewEmptyDenseUnionBuilder(mem memory.Allocator) *DenseUnionBuilder { // children and type codes. Builders will be constructed for each child // using the fields in typ func NewDenseUnionBuilder(mem memory.Allocator, typ *arrow.DenseUnionType) *DenseUnionBuilder { - children := make([]Builder, 0, len(typ.Fields())) + children := make([]Builder, 0, typ.NumFields()) defer func() { for _, child := range children { child.Release() diff --git a/go/arrow/array/util.go b/go/arrow/array/util.go index a1b3cc7d4e5f7..c9b730b040611 100644 --- a/go/arrow/array/util.go +++ b/go/arrow/array/util.go @@ -428,7 +428,7 @@ func (n *nullArrayFactory) create() *Data { } if nf, ok := dt.(arrow.NestedType); ok { - childData = make([]arrow.ArrayData, len(nf.Fields())) + childData = make([]arrow.ArrayData, nf.NumFields()) } switch dt := dt.(type) { diff --git a/go/arrow/avro/avro2parquet/main.go b/go/arrow/avro/avro2parquet/main.go new file mode 100644 index 0000000000000..45377b46a444c --- /dev/null +++ b/go/arrow/avro/avro2parquet/main.go @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "log" + "os" + "runtime/pprof" + "time" + + "github.com/apache/arrow/go/v15/arrow/avro" + "github.com/apache/arrow/go/v15/parquet" + "github.com/apache/arrow/go/v15/parquet/compress" + pq "github.com/apache/arrow/go/v15/parquet/pqarrow" +) + +var ( + cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`") + filepath = flag.String("file", "", "avro ocf to convert") +) + +func main() { + flag.Parse() + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal("could not create CPU profile: ", err) + } + defer f.Close() // error handling omitted for example + if err := pprof.StartCPUProfile(f); err != nil { + log.Fatal("could not start CPU profile: ", err) + } + defer pprof.StopCPUProfile() + } + if *filepath == "" { + fmt.Println("no file specified") + } + chunk := 1024 * 8 + ts := time.Now() + log.Println("starting:") + info, err := os.Stat(*filepath) + if err != nil { + fmt.Println(err) + os.Exit(1) + } + filesize := info.Size() + data, err := os.ReadFile(*filepath) + if err != nil { + fmt.Println(err) + os.Exit(2) + } + fmt.Printf("file : %v\nsize: %v MB\n", filepath, float64(filesize)/1024/1024) + + r := bytes.NewReader(data) + ior := bufio.NewReaderSize(r, 4096*8) + av2arReader, err := avro.NewOCFReader(ior, avro.WithChunk(chunk)) + if err != nil { + fmt.Println(err) + os.Exit(3) + } + fp, err := os.OpenFile(*filepath+".parquet", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) + if err != nil { + fmt.Println(err) + os.Exit(4) + } + defer fp.Close() + pwProperties := parquet.NewWriterProperties(parquet.WithDictionaryDefault(true), + parquet.WithVersion(parquet.V2_LATEST), + parquet.WithCompression(compress.Codecs.Snappy), + parquet.WithBatchSize(1024*32), + parquet.WithDataPageSize(1024*1024), + parquet.WithMaxRowGroupLength(64*1024*1024), + ) + awProperties := pq.NewArrowWriterProperties(pq.WithStoreSchema()) + pr, err := pq.NewFileWriter(av2arReader.Schema(), fp, pwProperties, awProperties) + if err != nil { + fmt.Println(err) + os.Exit(5) + } + defer pr.Close() + fmt.Printf("parquet version: %v\n", pwProperties.Version()) + for av2arReader.Next() { + if av2arReader.Err() != nil { + fmt.Println(err) + os.Exit(6) + } + recs := av2arReader.Record() + err = pr.WriteBuffered(recs) + if err != nil { + fmt.Println(err) + os.Exit(7) + } + recs.Release() + } + if av2arReader.Err() != nil { + fmt.Println(av2arReader.Err()) + } + + pr.Close() + log.Printf("time to convert: %v\n", time.Since(ts)) +} diff --git a/go/arrow/avro/loader.go b/go/arrow/avro/loader.go new file mode 100644 index 0000000000000..26d8678e8e2be --- /dev/null +++ b/go/arrow/avro/loader.go @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package avro + +import ( + "errors" + "fmt" + "io" +) + +func (r *OCFReader) decodeOCFToChan() { + defer close(r.avroChan) + for r.r.HasNext() { + select { + case <-r.readerCtx.Done(): + r.err = fmt.Errorf("avro decoding cancelled, %d records read", r.avroDatumCount) + return + default: + var datum any + err := r.r.Decode(&datum) + if err != nil { + if errors.Is(err, io.EOF) { + r.err = nil + return + } + r.err = err + return + } + r.avroChan <- datum + r.avroDatumCount++ + } + } +} + +func (r *OCFReader) recordFactory() { + defer close(r.recChan) + r.primed = true + recChunk := 0 + switch { + case r.chunk < 1: + for data := range r.avroChan { + err := r.ldr.loadDatum(data) + if err != nil { + r.err = err + return + } + } + r.recChan <- r.bld.NewRecord() + r.bldDone <- struct{}{} + case r.chunk >= 1: + for data := range r.avroChan { + if recChunk == 0 { + r.bld.Reserve(r.chunk) + } + err := r.ldr.loadDatum(data) + if err != nil { + r.err = err + return + } + recChunk++ + if recChunk >= r.chunk { + r.recChan <- r.bld.NewRecord() + recChunk = 0 + } + } + if recChunk != 0 { + r.recChan <- r.bld.NewRecord() + } + r.bldDone <- struct{}{} + } +} diff --git a/go/arrow/avro/reader.go b/go/arrow/avro/reader.go new file mode 100644 index 0000000000000..e72a5632bdd6e --- /dev/null +++ b/go/arrow/avro/reader.go @@ -0,0 +1,337 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package avro + +import ( + "context" + "errors" + "fmt" + "io" + "sync/atomic" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/array" + "github.com/apache/arrow/go/v15/arrow/internal/debug" + "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/hamba/avro/v2/ocf" + "github.com/tidwall/sjson" + + avro "github.com/hamba/avro/v2" +) + +var ErrMismatchFields = errors.New("arrow/avro: number of records mismatch") + +// Option configures an Avro reader/writer. +type ( + Option func(config) + config *OCFReader +) + +type schemaEdit struct { + method string + path string + value any +} + +// Reader wraps goavro/OCFReader and creates array.Records from a schema. +type OCFReader struct { + r *ocf.Decoder + avroSchema string + avroSchemaEdits []schemaEdit + schema *arrow.Schema + + refs int64 + bld *array.RecordBuilder + bldMap *fieldPos + ldr *dataLoader + cur arrow.Record + err error + + primed bool + readerCtx context.Context + readCancel func() + maxOCF int + maxRec int + + avroChan chan any + avroDatumCount int64 + avroChanSize int + recChan chan arrow.Record + + bldDone chan struct{} + + recChanSize int + chunk int + mem memory.Allocator +} + +// NewReader returns a reader that reads from an Avro OCF file and creates +// arrow.Records from the converted avro data. 
+func NewOCFReader(r io.Reader, opts ...Option) (*OCFReader, error) { + ocfr, err := ocf.NewDecoder(r) + if err != nil { + return nil, fmt.Errorf("%w: could not create avro ocfreader", arrow.ErrInvalid) + } + + rr := &OCFReader{ + r: ocfr, + refs: 1, + chunk: 1, + avroChanSize: 500, + recChanSize: 10, + } + for _, opt := range opts { + opt(rr) + } + + rr.avroChan = make(chan any, rr.avroChanSize) + rr.recChan = make(chan arrow.Record, rr.recChanSize) + rr.bldDone = make(chan struct{}) + schema, err := avro.Parse(string(ocfr.Metadata()["avro.schema"])) + if err != nil { + return nil, fmt.Errorf("%w: could not parse avro header", arrow.ErrInvalid) + } + rr.avroSchema = schema.String() + if len(rr.avroSchemaEdits) > 0 { + // execute schema edits + for _, e := range rr.avroSchemaEdits { + err := rr.editAvroSchema(e) + if err != nil { + return nil, fmt.Errorf("%w: could not edit avro schema", arrow.ErrInvalid) + } + } + // validate edited schema + schema, err = avro.Parse(rr.avroSchema) + if err != nil { + return nil, fmt.Errorf("%w: could not parse modified avro schema", arrow.ErrInvalid) + } + } + rr.schema, err = ArrowSchemaFromAvro(schema) + if err != nil { + return nil, fmt.Errorf("%w: could not convert avro schema", arrow.ErrInvalid) + } + if rr.mem == nil { + rr.mem = memory.DefaultAllocator + } + rr.readerCtx, rr.readCancel = context.WithCancel(context.Background()) + go rr.decodeOCFToChan() + + rr.bld = array.NewRecordBuilder(rr.mem, rr.schema) + rr.bldMap = newFieldPos() + rr.ldr = newDataLoader() + for idx, fb := range rr.bld.Fields() { + mapFieldBuilders(fb, rr.schema.Field(idx), rr.bldMap) + } + rr.ldr.drawTree(rr.bldMap) + go rr.recordFactory() + return rr, nil +} + +// Reuse allows the OCFReader to be reused to read another Avro file provided the +// new Avro file has an identical schema. +func (rr *OCFReader) Reuse(r io.Reader, opts ...Option) error { + rr.Close() + rr.err = nil + ocfr, err := ocf.NewDecoder(r) + if err != nil { + return fmt.Errorf("%w: could not create avro ocfreader", arrow.ErrInvalid) + } + schema, err := avro.Parse(string(ocfr.Metadata()["avro.schema"])) + if err != nil { + return fmt.Errorf("%w: could not parse avro header", arrow.ErrInvalid) + } + if rr.avroSchema != schema.String() { + return fmt.Errorf("%w: avro schema mismatch", arrow.ErrInvalid) + } + + rr.r = ocfr + for _, opt := range opts { + opt(rr) + } + + rr.maxOCF = 0 + rr.maxRec = 0 + rr.avroDatumCount = 0 + rr.primed = false + + rr.avroChan = make(chan any, rr.avroChanSize) + rr.recChan = make(chan arrow.Record, rr.recChanSize) + rr.bldDone = make(chan struct{}) + + rr.readerCtx, rr.readCancel = context.WithCancel(context.Background()) + go rr.decodeOCFToChan() + go rr.recordFactory() + return nil +} + +// Err returns the last error encountered during the iteration over the +// underlying Avro file. +func (r *OCFReader) Err() error { return r.err } + +// AvroSchema returns the Avro schema of the Avro OCF +func (r *OCFReader) AvroSchema() string { return r.avroSchema } + +// Schema returns the converted Arrow schema of the Avro OCF +func (r *OCFReader) Schema() *arrow.Schema { return r.schema } + +// Record returns the current record that has been extracted from the +// underlying Avro OCF file. +// It is valid until the next call to Next. +func (r *OCFReader) Record() arrow.Record { return r.cur } + +// Metrics returns the maximum queue depth of the Avro record read cache and of the +// converted Arrow record cache. +func (r *OCFReader) Metrics() string { + return fmt.Sprintf("Max. 
OCF queue depth: %d/%d Max. record queue depth: %d/%d", r.maxOCF, r.avroChanSize, r.maxRec, r.recChanSize) +} + +// OCFRecordsReadCount returns the number of Avro datum that were read from the Avro file. +func (r *OCFReader) OCFRecordsReadCount() int64 { return r.avroDatumCount } + +// Close closes the OCFReader's Avro record read cache and converted Arrow record cache. OCFReader must +// be closed if the Avro OCF's records have not been read to completion. +func (r *OCFReader) Close() { + r.readCancel() + r.err = r.readerCtx.Err() +} + +func (r *OCFReader) editAvroSchema(e schemaEdit) error { + var err error + switch e.method { + case "set": + r.avroSchema, err = sjson.Set(r.avroSchema, e.path, e.value) + if err != nil { + return fmt.Errorf("%w: schema edit 'set %s = %v' failure - %v", arrow.ErrInvalid, e.path, e.value, err) + } + case "delete": + r.avroSchema, err = sjson.Delete(r.avroSchema, e.path) + if err != nil { + return fmt.Errorf("%w: schema edit 'delete' failure - %v", arrow.ErrInvalid, err) + } + default: + return fmt.Errorf("%w: schema edit method must be 'set' or 'delete'", arrow.ErrInvalid) + } + return nil +} + +// Next returns whether a Record can be received from the converted record queue. +// The user should check Err() after call to Next that return false to check +// if an error took place. +func (r *OCFReader) Next() bool { + if r.cur != nil { + r.cur.Release() + r.cur = nil + } + if r.maxOCF < len(r.avroChan) { + r.maxOCF = len(r.avroChan) + } + if r.maxRec < len(r.recChan) { + r.maxRec = len(r.recChan) + } + select { + case r.cur = <-r.recChan: + case <-r.bldDone: + if len(r.recChan) > 0 { + r.cur = <-r.recChan + } + } + if r.err != nil { + return false + } + + return r.cur != nil +} + +// WithAllocator specifies the Arrow memory allocator used while building records. +func WithAllocator(mem memory.Allocator) Option { + return func(cfg config) { + cfg.mem = mem + } +} + +// WithReadCacheSize specifies the size of the OCF record decode queue, default value +// is 500. +func WithReadCacheSize(n int) Option { + return func(cfg config) { + if n < 1 { + cfg.avroChanSize = 500 + } else { + cfg.avroChanSize = n + } + } +} + +// WithRecordCacheSize specifies the size of the converted Arrow record queue, default +// value is 1. +func WithRecordCacheSize(n int) Option { + return func(cfg config) { + if n < 1 { + cfg.recChanSize = 1 + } else { + cfg.recChanSize = n + } + } +} + +// WithSchemaEdit specifies modifications to the Avro schema. Supported methods are 'set' and +// 'delete'. Set sets the value for the specified path. Delete deletes the value for the specified path. +// A path is in dot syntax, such as "fields.1" or "fields.0.type". The modified Avro schema is +// validated before conversion to Arrow schema - NewOCFReader will return an error if the modified schema +// cannot be parsed. +func WithSchemaEdit(method, path string, value any) Option { + return func(cfg config) { + var e schemaEdit + e.method = method + e.path = path + e.value = value + cfg.avroSchemaEdits = append(cfg.avroSchemaEdits, e) + } +} + +// WithChunk specifies the chunk size used while reading Avro OCF files. +// +// If n is zero or 1, no chunking will take place and the reader will create +// one record per row. +// If n is greater than 1, chunks of n rows will be read. +// If n is negative, the reader will load the whole Avro OCF file into memory and +// create one big record with all the rows. 
+func WithChunk(n int) Option { + return func(cfg config) { + cfg.chunk = n + } +} + +// Retain increases the reference count by 1. +// Retain may be called simultaneously from multiple goroutines. +func (r *OCFReader) Retain() { + atomic.AddInt64(&r.refs, 1) +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +// Release may be called simultaneously from multiple goroutines. +func (r *OCFReader) Release() { + debug.Assert(atomic.LoadInt64(&r.refs) > 0, "too many releases") + + if atomic.AddInt64(&r.refs, -1) == 0 { + if r.cur != nil { + r.cur.Release() + } + } +} + +var _ array.RecordReader = (*OCFReader)(nil) diff --git a/go/arrow/avro/reader_test.go b/go/arrow/avro/reader_test.go new file mode 100644 index 0000000000000..e94d4f48fb933 --- /dev/null +++ b/go/arrow/avro/reader_test.go @@ -0,0 +1,364 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package avro + +import ( + "fmt" + "testing" + + "github.com/apache/arrow/go/v15/arrow" + hamba "github.com/hamba/avro/v2" +) + +func TestEditSchemaStringEqual(t *testing.T) { + tests := []struct { + avroSchema string + arrowSchema []arrow.Field + }{ + { + avroSchema: `{ + "fields": [ + { + "name": "inheritNull", + "type": { + "name": "Simple", + "symbols": [ + "a", + "b" + ], + "type": "enum" + } + }, + { + "name": "explicitNamespace", + "type": { + "name": "test", + "namespace": "org.hamba.avro", + "size": 12, + "type": "fixed" + } + }, + { + "name": "fullName", + "type": { + "type": "record", + "name": "fullName_data", + "namespace": "ignored", + "doc": "A name attribute with a fullname, so the namespace attribute is ignored. The fullname is 'a.full.Name', and the namespace is 'a.full'.", + "fields": [{ + "name": "inheritNamespace", + "type": { + "type": "enum", + "name": "Understanding", + "doc": "A simple name (attribute) and no namespace attribute: inherit the namespace of the enclosing type 'a.full.Name'. 
The fullname is 'a.full.Understanding'.", + "symbols": ["d", "e"] + } + }, { + "name": "md5", + "type": { + "name": "md5_data", + "type": "fixed", + "size": 16, + "namespace": "ignored" + } + } + ] + } + }, + { + "name": "id", + "type": "int" + }, + { + "name": "bigId", + "type": "long" + }, + { + "name": "temperature", + "type": [ + "null", + "float" + ] + }, + { + "name": "fraction", + "type": [ + "null", + "double" + ] + }, + { + "name": "is_emergency", + "type": "boolean" + }, + { + "name": "remote_ip", + "type": [ + "null", + "bytes" + ] + }, + { + "name": "person", + "type": { + "fields": [ + { + "name": "lastname", + "type": "string" + }, + { + "name": "address", + "type": { + "fields": [ + { + "name": "streetaddress", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ], + "name": "AddressUSRecord", + "type": "record" + } + }, + { + "name": "mapfield", + "type": { + "default": { + }, + "type": "map", + "values": "long" + } + }, + { + "name": "arrayField", + "type": { + "default": [ + ], + "items": "string", + "type": "array" + } + } + ], + "name": "person_data", + "type": "record" + } + }, + { + "name": "decimalField", + "type": { + "logicalType": "decimal", + "precision": 4, + "scale": 2, + "type": "bytes" + } + }, + { + "logicalType": "uuid", + "name": "uuidField", + "type": "string" + }, + { + "name": "timemillis", + "type": { + "type": "int", + "logicalType": "time-millis" + } + }, + { + "name": "timemicros", + "type": { + "type": "long", + "logicalType": "time-micros" + } + }, + { + "name": "timestampmillis", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "timestampmicros", + "type": { + "type": "long", + "logicalType": "timestamp-micros" + } + }, + { + "name": "duration", + "type": { + "name": "duration", + "namespace": "whyowhy", + "logicalType": "duration", + "size": 12, + "type": "fixed" + } + }, + { + "name": "date", + "type": { + "logicalType": "date", + "type": "int" + } + } + ], + "name": "Example", + "type": "record" + }`, + arrowSchema: []arrow.Field{ + { + Name: "explicitNamespace", + Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, + }, + { + Name: "fullName", + Type: arrow.StructOf( + arrow.Field{ + Name: "inheritNamespace", + Type: &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint8, ValueType: arrow.BinaryTypes.String, Ordered: false}, + }, + arrow.Field{ + Name: "md5", + Type: &arrow.FixedSizeBinaryType{ByteWidth: 16}, + }, + ), + }, + { + Name: "id", + Type: arrow.PrimitiveTypes.Int32, + }, + { + Name: "bigId", + Type: arrow.PrimitiveTypes.Int64, + }, + { + Name: "temperature", + Type: arrow.PrimitiveTypes.Float32, + Nullable: true, + }, + { + Name: "fraction", + Type: arrow.PrimitiveTypes.Float64, + Nullable: true, + }, + { + Name: "is_emergency", + Type: arrow.FixedWidthTypes.Boolean, + }, + { + Name: "remote_ip", + Type: arrow.BinaryTypes.Binary, + Nullable: true, + }, + { + Name: "person", + Type: arrow.StructOf( + arrow.Field{ + Name: "lastname", + Type: arrow.BinaryTypes.String, + }, + arrow.Field{ + Name: "address", + Type: arrow.StructOf( + arrow.Field{ + Name: "streetaddress", + Type: arrow.BinaryTypes.String, + }, + arrow.Field{ + Name: "city", + Type: arrow.BinaryTypes.String, + }, + ), + }, + arrow.Field{ + Name: "mapfield", + Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), + Nullable: true, + }, + arrow.Field{ + Name: "arrayField", + Type: arrow.ListOfNonNullable(arrow.BinaryTypes.String), + }, + ), + }, + { + Name: "decimalField", + Type: 
&arrow.Decimal128Type{Precision: 4, Scale: 2}, + }, + { + Name: "uuidField", + Type: arrow.BinaryTypes.String, + }, + { + Name: "timemillis", + Type: arrow.FixedWidthTypes.Time32ms, + }, + { + Name: "timemicros", + Type: arrow.FixedWidthTypes.Time64us, + }, + { + Name: "timestampmillis", + Type: arrow.FixedWidthTypes.Timestamp_ms, + }, + { + Name: "timestampmicros", + Type: arrow.FixedWidthTypes.Timestamp_us, + }, + { + Name: "duration", + Type: arrow.FixedWidthTypes.MonthDayNanoInterval, + }, + { + Name: "date", + Type: arrow.FixedWidthTypes.Date32, + }, + }, + }, + } + + for _, test := range tests { + t.Run("", func(t *testing.T) { + want := arrow.NewSchema(test.arrowSchema, nil) + + schema, err := hamba.ParseBytes([]byte(test.avroSchema)) + if err != nil { + t.Fatalf("%v", err) + } + r := new(OCFReader) + r.avroSchema = schema.String() + r.editAvroSchema(schemaEdit{method: "delete", path: "fields.0"}) + schema, err = hamba.Parse(r.avroSchema) + if err != nil { + t.Fatalf("%v: could not parse modified avro schema", arrow.ErrInvalid) + } + got, err := ArrowSchemaFromAvro(schema) + if err != nil { + t.Fatalf("%v", err) + } + if !(fmt.Sprintf("%+v", want.String()) == fmt.Sprintf("%+v", got.String())) { + t.Fatalf("got=%v,\n want=%v", got.String(), want.String()) + } else { + t.Logf("schema.String() comparison passed") + } + }) + } +} diff --git a/go/arrow/avro/reader_types.go b/go/arrow/avro/reader_types.go new file mode 100644 index 0000000000000..5658c6e587db2 --- /dev/null +++ b/go/arrow/avro/reader_types.go @@ -0,0 +1,875 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package avro + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "math/big" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/array" + "github.com/apache/arrow/go/v15/arrow/decimal128" + "github.com/apache/arrow/go/v15/arrow/decimal256" + "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/apache/arrow/go/v15/internal/types" +) + +type dataLoader struct { + idx, depth int32 + list *fieldPos + item *fieldPos + mapField *fieldPos + mapKey *fieldPos + mapValue *fieldPos + fields []*fieldPos + children []*dataLoader +} + +var ( + ErrNullStructData = errors.New("null struct data") +) + +func newDataLoader() *dataLoader { return &dataLoader{idx: 0, depth: 0} } + +// drawTree takes the tree of field builders produced by mapFieldBuilders() +// and produces another tree structure and aggregates fields whose values can +// be retrieved from a `map[string]any` into a slice of builders, and creates a hierarchy to +// deal with nested types (lists and maps). 
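For reference, the conversion path exercised by the test above can also be driven standalone: parse an Avro schema with hamba/avro and hand it to ArrowSchemaFromAvro. A rough sketch (not part of this patch; the inline schema literal is an illustrative assumption):

package main

import (
	"fmt"
	"log"

	"github.com/apache/arrow/go/v15/arrow/avro"
	hamba "github.com/hamba/avro/v2"
)

func main() {
	// Parse an Avro record schema, then convert it to an Arrow schema.
	s, err := hamba.Parse(`{"type": "record", "name": "Example", "fields": [{"name": "id", "type": "long"}]}`)
	if err != nil {
		log.Fatal(err)
	}
	arrowSchema, err := avro.ArrowSchemaFromAvro(s)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(arrowSchema.String()) // expected to show the "id" field as int64
}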
+func (d *dataLoader) drawTree(field *fieldPos) { + for _, f := range field.children() { + if f.isList || f.isMap { + if f.isList { + c := d.newListChild(f) + if !f.childrens[0].isList { + c.item = f.childrens[0] + c.drawTree(f.childrens[0]) + } else { + c.drawTree(f.childrens[0].childrens[0]) + } + } + if f.isMap { + c := d.newMapChild(f) + if !arrow.IsNested(f.childrens[1].builder.Type().ID()) { + c.mapKey = f.childrens[0] + c.mapValue = f.childrens[1] + } else { + c.mapKey = f.childrens[0] + m := c.newChild() + m.mapValue = f.childrens[1] + m.drawTree(f.childrens[1]) + } + } + } else { + d.fields = append(d.fields, f) + if len(f.children()) > 0 { + d.drawTree(f) + } + } + } +} + +// loadDatum loads decoded Avro data to the schema fields' builder functions. +// Since array.StructBuilder.AppendNull() will recursively append null to all of the +// struct's fields, in the case of nil being passed to a struct's builderFunc it will +// return a ErrNullStructData error to signal that all its sub-fields can be skipped. +func (d *dataLoader) loadDatum(data any) error { + if d.list == nil && d.mapField == nil { + if d.mapValue != nil { + d.mapValue.appendFunc(data) + } + var NullParent *fieldPos + for _, f := range d.fields { + if f.parent == NullParent { + continue + } + if d.mapValue == nil { + err := f.appendFunc(f.getValue(data)) + if err != nil { + if err == ErrNullStructData { + NullParent = f + continue + } + return err + } + } else { + switch dt := data.(type) { + case nil: + err := f.appendFunc(dt) + if err != nil { + if err == ErrNullStructData { + NullParent = f + continue + } + return err + } + case []any: + if len(d.children) < 1 { + for _, e := range dt { + err := f.appendFunc(e) + if err != nil { + if err == ErrNullStructData { + NullParent = f + continue + } + return err + } + } + } else { + for _, e := range dt { + d.children[0].loadDatum(e) + } + } + case map[string]any: + err := f.appendFunc(f.getValue(dt)) + if err != nil { + if err == ErrNullStructData { + NullParent = f + continue + } + return err + } + } + + } + } + for _, c := range d.children { + if c.list != nil { + c.loadDatum(c.list.getValue(data)) + } + if c.mapField != nil { + switch dt := data.(type) { + case nil: + c.loadDatum(dt) + case map[string]any: + c.loadDatum(c.mapField.getValue(dt)) + default: + c.loadDatum(c.mapField.getValue(data)) + } + } + } + } else { + if d.list != nil { + switch dt := data.(type) { + case nil: + d.list.appendFunc(dt) + case []any: + d.list.appendFunc(dt) + for _, e := range dt { + if d.item != nil { + d.item.appendFunc(e) + } + var NullParent *fieldPos + for _, f := range d.fields { + if f.parent == NullParent { + continue + } + err := f.appendFunc(f.getValue(e)) + if err != nil { + if err == ErrNullStructData { + NullParent = f + continue + } + return err + } + } + for _, c := range d.children { + if c.list != nil { + c.loadDatum(c.list.getValue(e)) + } + if c.mapField != nil { + c.loadDatum(c.mapField.getValue(e)) + } + } + } + case map[string]any: + d.list.appendFunc(dt["array"]) + for _, e := range dt["array"].([]any) { + if d.item != nil { + d.item.appendFunc(e) + } + var NullParent *fieldPos + for _, f := range d.fields { + if f.parent == NullParent { + continue + } + err := f.appendFunc(f.getValue(e)) + if err != nil { + if err == ErrNullStructData { + NullParent = f + continue + } + return err + } + } + for _, c := range d.children { + c.loadDatum(c.list.getValue(e)) + } + } + default: + d.list.appendFunc(data) + d.item.appendFunc(dt) + } + } + if d.mapField != nil { + 
switch dt := data.(type) { + case nil: + d.mapField.appendFunc(dt) + case map[string]any: + + d.mapField.appendFunc(dt) + for k, v := range dt { + d.mapKey.appendFunc(k) + if d.mapValue != nil { + d.mapValue.appendFunc(v) + } else { + d.children[0].loadDatum(v) + } + } + } + } + } + return nil +} + +func (d *dataLoader) newChild() *dataLoader { + var child *dataLoader = &dataLoader{ + depth: d.depth + 1, + } + d.children = append(d.children, child) + return child +} + +func (d *dataLoader) newListChild(list *fieldPos) *dataLoader { + var child *dataLoader = &dataLoader{ + list: list, + item: list.childrens[0], + depth: d.depth + 1, + } + d.children = append(d.children, child) + return child +} + +func (d *dataLoader) newMapChild(mapField *fieldPos) *dataLoader { + var child *dataLoader = &dataLoader{ + mapField: mapField, + depth: d.depth + 1, + } + d.children = append(d.children, child) + return child +} + +type fieldPos struct { + parent *fieldPos + fieldName string + builder array.Builder + path []string + isList bool + isItem bool + isStruct bool + isMap bool + typeName string + appendFunc func(val interface{}) error + metadatas arrow.Metadata + childrens []*fieldPos + index, depth int32 +} + +func newFieldPos() *fieldPos { return &fieldPos{index: -1} } + +func (f *fieldPos) children() []*fieldPos { return f.childrens } + +func (f *fieldPos) newChild(childName string, childBuilder array.Builder, meta arrow.Metadata) *fieldPos { + var child fieldPos = fieldPos{ + parent: f, + fieldName: childName, + builder: childBuilder, + metadatas: meta, + index: int32(len(f.childrens)), + depth: f.depth + 1, + } + if f.isList { + child.isItem = true + } + child.path = child.buildNamePath() + f.childrens = append(f.childrens, &child) + return &child +} + +func (f *fieldPos) buildNamePath() []string { + var path []string + var listPath []string + cur := f + for i := f.depth - 1; i >= 0; i-- { + if cur.typeName == "" { + path = append([]string{cur.fieldName}, path...) + } else { + path = append([]string{cur.fieldName, cur.typeName}, path...) + } + if !cur.parent.isMap { + cur = cur.parent + } + } + if f.parent.parent != nil && f.parent.parent.isList { + for i := len(path) - 1; i >= 0; i-- { + if path[i] != "item" { + listPath = append([]string{path[i]}, listPath...) + } else { + return listPath + } + } + } + if f.parent != nil && f.parent.fieldName == "value" { + for i := len(path) - 1; i >= 0; i-- { + if path[i] != "value" { + listPath = append([]string{path[i]}, listPath...) 
+ } else { + return listPath + } + } + } + return path +} + +// NamePath returns a slice of keys making up the path to the field +func (f *fieldPos) namePath() []string { return f.path } + +// GetValue retrieves the value from the map[string]any +// by following the field's key path +func (f *fieldPos) getValue(m any) any { + if _, ok := m.(map[string]any); !ok { + return m + } + for _, key := range f.namePath() { + valueMap, ok := m.(map[string]any) + if !ok { + if key == "item" { + return m + } + return nil + } + m, ok = valueMap[key] + if !ok { + return nil + } + } + return m +} + +// Avro data is loaded to Arrow arrays using the following type mapping: +// +// Avro Go Arrow +// null nil Null +// boolean bool Boolean +// bytes []byte Binary +// float float32 Float32 +// double float64 Float64 +// long int64 Int64 +// int int32 Int32 +// string string String +// array []interface{} List +// enum string Dictionary +// fixed []byte FixedSizeBinary +// map and record map[string]any Struct +// +// mapFieldBuilders builds a tree of field builders matching the Arrow schema +func mapFieldBuilders(b array.Builder, field arrow.Field, parent *fieldPos) { + f := parent.newChild(field.Name, b, field.Metadata) + switch bt := b.(type) { + case *array.BinaryBuilder: + f.appendFunc = func(data interface{}) error { + appendBinaryData(bt, data) + return nil + } + case *array.BinaryDictionaryBuilder: + // has metadata for Avro enum symbols + f.appendFunc = func(data interface{}) error { + appendBinaryDictData(bt, data) + return nil + } + // add Avro enum symbols to builder + sb := array.NewStringBuilder(memory.DefaultAllocator) + for _, v := range field.Metadata.Values() { + sb.Append(v) + } + sa := sb.NewStringArray() + bt.InsertStringDictValues(sa) + case *array.BooleanBuilder: + f.appendFunc = func(data interface{}) error { + appendBoolData(bt, data) + return nil + } + case *array.Date32Builder: + f.appendFunc = func(data interface{}) error { + appendDate32Data(bt, data) + return nil + } + case *array.Decimal128Builder: + f.appendFunc = func(data interface{}) error { + err := appendDecimal128Data(bt, data) + if err != nil { + return err + } + return nil + } + case *array.Decimal256Builder: + f.appendFunc = func(data interface{}) error { + err := appendDecimal256Data(bt, data) + if err != nil { + return err + } + return nil + } + case *types.UUIDBuilder: + f.appendFunc = func(data interface{}) error { + switch dt := data.(type) { + case nil: + bt.AppendNull() + case string: + err := bt.AppendValueFromString(dt) + if err != nil { + return err + } + case []byte: + err := bt.AppendValueFromString(string(dt)) + if err != nil { + return err + } + } + return nil + } + case *array.FixedSizeBinaryBuilder: + f.appendFunc = func(data interface{}) error { + appendFixedSizeBinaryData(bt, data) + return nil + } + case *array.Float32Builder: + f.appendFunc = func(data interface{}) error { + appendFloat32Data(bt, data) + return nil + } + case *array.Float64Builder: + f.appendFunc = func(data interface{}) error { + appendFloat64Data(bt, data) + return nil + } + case *array.Int32Builder: + f.appendFunc = func(data interface{}) error { + appendInt32Data(bt, data) + return nil + } + case *array.Int64Builder: + f.appendFunc = func(data interface{}) error { + appendInt64Data(bt, data) + return nil + } + case *array.LargeListBuilder: + vb := bt.ValueBuilder() + f.isList = true + mapFieldBuilders(vb, field.Type.(*arrow.LargeListType).ElemField(), f) + f.appendFunc = func(data interface{}) error { + switch dt := data.(type) { + 
case nil: + bt.AppendNull() + case []interface{}: + if len(dt) == 0 { + bt.AppendEmptyValue() + } else { + bt.Append(true) + } + default: + bt.Append(true) + } + return nil + } + case *array.ListBuilder: + vb := bt.ValueBuilder() + f.isList = true + mapFieldBuilders(vb, field.Type.(*arrow.ListType).ElemField(), f) + f.appendFunc = func(data interface{}) error { + switch dt := data.(type) { + case nil: + bt.AppendNull() + case []interface{}: + if len(dt) == 0 { + bt.AppendEmptyValue() + } else { + bt.Append(true) + } + default: + bt.Append(true) + } + return nil + } + case *array.MapBuilder: + // has metadata for objects in values + f.isMap = true + kb := bt.KeyBuilder() + ib := bt.ItemBuilder() + mapFieldBuilders(kb, field.Type.(*arrow.MapType).KeyField(), f) + mapFieldBuilders(ib, field.Type.(*arrow.MapType).ItemField(), f) + f.appendFunc = func(data interface{}) error { + switch data.(type) { + case nil: + bt.AppendNull() + default: + bt.Append(true) + } + return nil + } + case *array.MonthDayNanoIntervalBuilder: + f.appendFunc = func(data interface{}) error { + appendDurationData(bt, data) + return nil + } + case *array.StringBuilder: + f.appendFunc = func(data interface{}) error { + appendStringData(bt, data) + return nil + } + case *array.StructBuilder: + // has metadata for Avro Union named types + f.typeName, _ = field.Metadata.GetValue("typeName") + f.isStruct = true + // create children + for i, p := range field.Type.(*arrow.StructType).Fields() { + mapFieldBuilders(bt.FieldBuilder(i), p, f) + } + f.appendFunc = func(data interface{}) error { + switch data.(type) { + case nil: + bt.AppendNull() + return ErrNullStructData + default: + bt.Append(true) + } + return nil + } + case *array.Time32Builder: + f.appendFunc = func(data interface{}) error { + appendTime32Data(bt, data) + return nil + } + case *array.Time64Builder: + f.appendFunc = func(data interface{}) error { + appendTime64Data(bt, data) + return nil + } + case *array.TimestampBuilder: + f.appendFunc = func(data interface{}) error { + appendTimestampData(bt, data) + return nil + } + } +} + +func appendBinaryData(b *array.BinaryBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case map[string]any: + switch ct := dt["bytes"].(type) { + case nil: + b.AppendNull() + default: + b.Append(ct.([]byte)) + } + default: + b.Append(fmt.Append([]byte{}, data)) + } +} + +func appendBinaryDictData(b *array.BinaryDictionaryBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case string: + b.AppendString(dt) + case map[string]any: + switch v := dt["string"].(type) { + case nil: + b.AppendNull() + case string: + b.AppendString(v) + } + } +} + +func appendBoolData(b *array.BooleanBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case bool: + b.Append(dt) + case map[string]any: + switch v := dt["boolean"].(type) { + case nil: + b.AppendNull() + case bool: + b.Append(v) + } + } +} + +func appendDate32Data(b *array.Date32Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case int32: + b.Append(arrow.Date32(dt)) + case map[string]any: + switch v := dt["int"].(type) { + case nil: + b.AppendNull() + case int32: + b.Append(arrow.Date32(v)) + } + } +} + +func appendDecimal128Data(b *array.Decimal128Builder, data interface{}) error { + switch dt := data.(type) { + case nil: + b.AppendNull() + case []byte: + buf := bytes.NewBuffer(dt) + if len(dt) <= 38 { + var intData int64 + err := binary.Read(buf, 
binary.BigEndian, &intData) + if err != nil { + return err + } + b.Append(decimal128.FromI64(intData)) + } else { + var bigIntData big.Int + b.Append(decimal128.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) + } + case map[string]any: + buf := bytes.NewBuffer(dt["bytes"].([]byte)) + if len(dt["bytes"].([]byte)) <= 38 { + var intData int64 + err := binary.Read(buf, binary.BigEndian, &intData) + if err != nil { + return err + } + b.Append(decimal128.FromI64(intData)) + } else { + var bigIntData big.Int + b.Append(decimal128.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) + } + } + return nil +} + +func appendDecimal256Data(b *array.Decimal256Builder, data interface{}) error { + switch dt := data.(type) { + case nil: + b.AppendNull() + case []byte: + var bigIntData big.Int + buf := bytes.NewBuffer(dt) + b.Append(decimal256.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) + case map[string]any: + var bigIntData big.Int + buf := bytes.NewBuffer(dt["bytes"].([]byte)) + b.Append(decimal256.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) + } + return nil +} + +// Avro duration logical type annotates Avro fixed type of size 12, which stores three little-endian +// unsigned integers that represent durations at different granularities of time. The first stores +// a number in months, the second stores a number in days, and the third stores a number in milliseconds. +func appendDurationData(b *array.MonthDayNanoIntervalBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case []byte: + dur := new(arrow.MonthDayNanoInterval) + dur.Months = int32(binary.LittleEndian.Uint16(dt[:3])) + dur.Days = int32(binary.LittleEndian.Uint16(dt[4:7])) + dur.Nanoseconds = int64(binary.LittleEndian.Uint32(dt[8:]) * 1000000) + b.Append(*dur) + case map[string]any: + switch dtb := dt["bytes"].(type) { + case nil: + b.AppendNull() + case []byte: + dur := new(arrow.MonthDayNanoInterval) + dur.Months = int32(binary.LittleEndian.Uint16(dtb[:3])) + dur.Days = int32(binary.LittleEndian.Uint16(dtb[4:7])) + dur.Nanoseconds = int64(binary.LittleEndian.Uint32(dtb[8:]) * 1000000) + b.Append(*dur) + } + } +} + +func appendFixedSizeBinaryData(b *array.FixedSizeBinaryBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case []byte: + b.Append(dt) + case map[string]any: + switch v := dt["bytes"].(type) { + case nil: + b.AppendNull() + case []byte: + b.Append(v) + } + } +} + +func appendFloat32Data(b *array.Float32Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case float32: + b.Append(dt) + case map[string]any: + switch v := dt["float"].(type) { + case nil: + b.AppendNull() + case float32: + b.Append(v) + } + } +} + +func appendFloat64Data(b *array.Float64Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case float64: + b.Append(dt) + case map[string]any: + switch v := dt["double"].(type) { + case nil: + b.AppendNull() + case float64: + b.Append(v) + } + } +} + +func appendInt32Data(b *array.Int32Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case int: + b.Append(int32(dt)) + case int32: + b.Append(dt) + case map[string]any: + switch v := dt["int"].(type) { + case nil: + b.AppendNull() + case int: + b.Append(int32(v)) + case int32: + b.Append(v) + } + } +} + +func appendInt64Data(b *array.Int64Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case int: + b.Append(int64(dt)) + case int64: + b.Append(dt) + case 
map[string]any: + switch v := dt["long"].(type) { + case nil: + b.AppendNull() + case int: + b.Append(int64(v)) + case int64: + b.Append(v) + } + } +} + +func appendStringData(b *array.StringBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case string: + b.Append(dt) + case map[string]any: + switch v := dt["string"].(type) { + case nil: + b.AppendNull() + case string: + b.Append(v) + } + default: + b.Append(fmt.Sprint(data)) + } +} + +func appendTime32Data(b *array.Time32Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case int32: + b.Append(arrow.Time32(dt)) + case map[string]any: + switch v := dt["int"].(type) { + case nil: + b.AppendNull() + case int32: + b.Append(arrow.Time32(v)) + } + } +} + +func appendTime64Data(b *array.Time64Builder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case int64: + b.Append(arrow.Time64(dt)) + case map[string]any: + switch v := dt["long"].(type) { + case nil: + b.AppendNull() + case int64: + b.Append(arrow.Time64(v)) + } + } +} + +func appendTimestampData(b *array.TimestampBuilder, data interface{}) { + switch dt := data.(type) { + case nil: + b.AppendNull() + case int64: + b.Append(arrow.Timestamp(dt)) + case map[string]any: + switch v := dt["long"].(type) { + case nil: + b.AppendNull() + case int64: + b.Append(arrow.Timestamp(v)) + } + } +} diff --git a/go/arrow/avro/schema.go b/go/arrow/avro/schema.go new file mode 100644 index 0000000000000..32e37096c68f2 --- /dev/null +++ b/go/arrow/avro/schema.go @@ -0,0 +1,429 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package avro reads Avro OCF files and presents the extracted data as records +package avro + +import ( + "fmt" + "math" + "strconv" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/decimal128" + "github.com/apache/arrow/go/v15/internal/types" + avro "github.com/hamba/avro/v2" +) + +type schemaNode struct { + name string + parent *schemaNode + schema avro.Schema + union bool + nullable bool + childrens []*schemaNode + arrowField arrow.Field + schemaCache *avro.SchemaCache + index, depth int32 +} + +func newSchemaNode() *schemaNode { + var schemaCache avro.SchemaCache + return &schemaNode{name: "", index: -1, schemaCache: &schemaCache} +} + +func (node *schemaNode) schemaPath() string { + var path string + n := node + for n.parent != nil { + path = "." 
+ n.name + path + n = n.parent + } + return path +} + +func (node *schemaNode) newChild(n string, s avro.Schema) *schemaNode { + child := &schemaNode{ + name: n, + parent: node, + schema: s, + schemaCache: node.schemaCache, + index: int32(len(node.childrens)), + depth: node.depth + 1, + } + node.childrens = append(node.childrens, child) + return child +} +func (node *schemaNode) children() []*schemaNode { return node.childrens } + +// func (node *schemaNode) nodeName() string { return node.name } + +// ArrowSchemaFromAvro returns a new Arrow schema from an Avro schema +func ArrowSchemaFromAvro(schema avro.Schema) (s *arrow.Schema, err error) { + defer func() { + if r := recover(); r != nil { + s = nil + switch x := r.(type) { + case string: + err = fmt.Errorf("invalid avro schema: %s", x) + case error: + err = fmt.Errorf("invalid avro schema: %w", x) + default: + err = fmt.Errorf("invalid avro schema: unknown error") + } + } + }() + n := newSchemaNode() + n.schema = schema + c := n.newChild(n.schema.(avro.NamedSchema).Name(), n.schema) + arrowSchemafromAvro(c) + var fields []arrow.Field + for _, g := range c.children() { + fields = append(fields, g.arrowField) + } + s = arrow.NewSchema(fields, nil) + return s, nil +} + +func arrowSchemafromAvro(n *schemaNode) { + if ns, ok := n.schema.(avro.NamedSchema); ok { + n.schemaCache.Add(ns.Name(), ns) + } + switch st := n.schema.Type(); st { + case "record": + iterateFields(n) + case "enum": + n.schemaCache.Add(n.schema.(avro.NamedSchema).Name(), n.schema.(*avro.EnumSchema)) + symbols := make(map[string]string) + for index, symbol := range n.schema.(avro.PropertySchema).(*avro.EnumSchema).Symbols() { + k := strconv.FormatInt(int64(index), 10) + symbols[k] = symbol + } + var dt arrow.DictionaryType = arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint64, ValueType: arrow.BinaryTypes.String, Ordered: false} + sl := int64(len(symbols)) + switch { + case sl <= math.MaxUint8: + dt.IndexType = arrow.PrimitiveTypes.Uint8 + case sl > math.MaxUint8 && sl <= math.MaxUint16: + dt.IndexType = arrow.PrimitiveTypes.Uint16 + case sl > math.MaxUint16 && sl <= math.MaxUint32: + dt.IndexType = arrow.PrimitiveTypes.Uint32 + } + n.arrowField = buildArrowField(n, &dt, arrow.MetadataFrom(symbols)) + case "array": + // logical items type + c := n.newChild(n.name, n.schema.(*avro.ArraySchema).Items()) + if isLogicalSchemaType(n.schema.(*avro.ArraySchema).Items()) { + avroLogicalToArrowField(c) + } else { + arrowSchemafromAvro(c) + } + switch c.arrowField.Nullable { + case true: + n.arrowField = arrow.Field{Name: n.name, Type: arrow.ListOfField(c.arrowField), Metadata: c.arrowField.Metadata} + case false: + n.arrowField = arrow.Field{Name: n.name, Type: arrow.ListOfNonNullable(c.arrowField.Type), Metadata: c.arrowField.Metadata} + } + case "map": + n.schemaCache.Add(n.schema.(*avro.MapSchema).Values().(avro.NamedSchema).Name(), n.schema.(*avro.MapSchema).Values()) + c := n.newChild(n.name, n.schema.(*avro.MapSchema).Values()) + arrowSchemafromAvro(c) + n.arrowField = buildArrowField(n, arrow.MapOf(arrow.BinaryTypes.String, c.arrowField.Type), c.arrowField.Metadata) + case "union": + if n.schema.(*avro.UnionSchema).Nullable() { + if len(n.schema.(*avro.UnionSchema).Types()) > 1 { + n.schema = n.schema.(*avro.UnionSchema).Types()[1] + n.union = true + n.nullable = true + arrowSchemafromAvro(n) + } + } + // Avro "fixed" field type = Arrow FixedSize Primitive BinaryType + case "fixed": + n.schemaCache.Add(n.schema.(avro.NamedSchema).Name(), n.schema.(*avro.FixedSchema)) 
+ if isLogicalSchemaType(n.schema) { + avroLogicalToArrowField(n) + } else { + n.arrowField = buildArrowField(n, &arrow.FixedSizeBinaryType{ByteWidth: n.schema.(*avro.FixedSchema).Size()}, arrow.Metadata{}) + } + case "string", "bytes", "int", "long": + if isLogicalSchemaType(n.schema) { + avroLogicalToArrowField(n) + } else { + n.arrowField = buildArrowField(n, avroPrimitiveToArrowType(string(st)), arrow.Metadata{}) + } + case "float", "double", "boolean": + n.arrowField = arrow.Field{Name: n.name, Type: avroPrimitiveToArrowType(string(st)), Nullable: n.nullable} + case "": + refSchema := n.schemaCache.Get(string(n.schema.(*avro.RefSchema).Schema().Name())) + if refSchema == nil { + panic(fmt.Errorf("could not find schema for '%v' in schema cache - %v", n.schemaPath(), n.schema.(*avro.RefSchema).Schema().Name())) + } + n.schema = refSchema + arrowSchemafromAvro(n) + case "null": + n.schemaCache.Add(n.schema.(*avro.MapSchema).Values().(avro.NamedSchema).Name(), &avro.NullSchema{}) + n.nullable = true + n.arrowField = buildArrowField(n, arrow.Null, arrow.Metadata{}) + } +} + +// iterate record Fields() +func iterateFields(n *schemaNode) { + for _, f := range n.schema.(*avro.RecordSchema).Fields() { + switch ft := f.Type().(type) { + // Avro "array" field type + case *avro.ArraySchema: + n.schemaCache.Add(f.Name(), ft.Items()) + // logical items type + c := n.newChild(f.Name(), ft.Items()) + if isLogicalSchemaType(ft.Items()) { + avroLogicalToArrowField(c) + } else { + arrowSchemafromAvro(c) + } + switch c.arrowField.Nullable { + case true: + c.arrowField = arrow.Field{Name: c.name, Type: arrow.ListOfField(c.arrowField), Metadata: c.arrowField.Metadata} + case false: + c.arrowField = arrow.Field{Name: c.name, Type: arrow.ListOfNonNullable(c.arrowField.Type), Metadata: c.arrowField.Metadata} + } + // Avro "enum" field type = Arrow dictionary type + case *avro.EnumSchema: + n.schemaCache.Add(f.Type().(*avro.EnumSchema).Name(), f.Type()) + c := n.newChild(f.Name(), f.Type()) + symbols := make(map[string]string) + for index, symbol := range ft.Symbols() { + k := strconv.FormatInt(int64(index), 10) + symbols[k] = symbol + } + var dt arrow.DictionaryType = arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint64, ValueType: arrow.BinaryTypes.String, Ordered: false} + sl := len(symbols) + switch { + case sl <= math.MaxUint8: + dt.IndexType = arrow.PrimitiveTypes.Uint8 + case sl > math.MaxUint8 && sl <= math.MaxUint16: + dt.IndexType = arrow.PrimitiveTypes.Uint16 + case sl > math.MaxUint16 && sl <= math.MaxInt: + dt.IndexType = arrow.PrimitiveTypes.Uint32 + } + c.arrowField = buildArrowField(c, &dt, arrow.MetadataFrom(symbols)) + // Avro "fixed" field type = Arrow FixedSize Primitive BinaryType + case *avro.FixedSchema: + n.schemaCache.Add(f.Name(), f.Type()) + c := n.newChild(f.Name(), f.Type()) + if isLogicalSchemaType(f.Type()) { + avroLogicalToArrowField(c) + } else { + arrowSchemafromAvro(c) + } + case *avro.RecordSchema: + n.schemaCache.Add(f.Name(), f.Type()) + c := n.newChild(f.Name(), f.Type()) + iterateFields(c) + // Avro "map" field type - KVP with value of one type - keys are strings + case *avro.MapSchema: + n.schemaCache.Add(f.Name(), ft.Values()) + c := n.newChild(f.Name(), ft.Values()) + arrowSchemafromAvro(c) + c.arrowField = buildArrowField(c, arrow.MapOf(arrow.BinaryTypes.String, c.arrowField.Type), c.arrowField.Metadata) + case *avro.UnionSchema: + if ft.Nullable() { + if len(ft.Types()) > 1 { + n.schemaCache.Add(f.Name(), ft.Types()[1]) + c := n.newChild(f.Name(), 
ft.Types()[1])
+ c.union = true
+ c.nullable = true
+ arrowSchemafromAvro(c)
+ }
+ }
+ default:
+ n.schemaCache.Add(f.Name(), f.Type())
+ if isLogicalSchemaType(f.Type()) {
+ c := n.newChild(f.Name(), f.Type())
+ avroLogicalToArrowField(c)
+ } else {
+ c := n.newChild(f.Name(), f.Type())
+ arrowSchemafromAvro(c)
+ }
+
+ }
+ }
+ var fields []arrow.Field
+ for _, child := range n.children() {
+ fields = append(fields, child.arrowField)
+ }
+
+ namedSchema, ok := isNamedSchema(n.schema)
+
+ var md arrow.Metadata
+ if ok && namedSchema != n.name+"_data" && n.union {
+ md = arrow.NewMetadata([]string{"typeName"}, []string{namedSchema})
+ }
+ n.arrowField = buildArrowField(n, arrow.StructOf(fields...), md)
+}
+
+func isLogicalSchemaType(s avro.Schema) bool {
+ lts, ok := s.(avro.LogicalTypeSchema)
+ if !ok {
+ return false
+ }
+ if lts.Logical() != nil {
+ return true
+ }
+ return false
+}
+
+func isNamedSchema(s avro.Schema) (string, bool) {
+ if ns, ok := s.(avro.NamedSchema); ok {
+ return ns.FullName(), ok
+ }
+ return "", false
+}
+
+func buildArrowField(n *schemaNode, t arrow.DataType, m arrow.Metadata) arrow.Field {
+ return arrow.Field{
+ Name: n.name,
+ Type: t,
+ Metadata: m,
+ Nullable: n.nullable,
+ }
+}
+
+// avroPrimitiveToArrowType maps an Avro primitive type to its Arrow equivalent.
+//
+// NOTE: Arrow Binary type is used as a catchall to avoid potential data loss.
+func avroPrimitiveToArrowType(avroFieldType string) arrow.DataType {
+ switch avroFieldType {
+ // int: 32-bit signed integer
+ case "int":
+ return arrow.PrimitiveTypes.Int32
+ // long: 64-bit signed integer
+ case "long":
+ return arrow.PrimitiveTypes.Int64
+ // float: single precision (32-bit) IEEE 754 floating-point number
+ case "float":
+ return arrow.PrimitiveTypes.Float32
+ // double: double precision (64-bit) IEEE 754 floating-point number
+ case "double":
+ return arrow.PrimitiveTypes.Float64
+ // bytes: sequence of 8-bit unsigned bytes
+ case "bytes":
+ return arrow.BinaryTypes.Binary
+ // boolean: a binary value
+ case "boolean":
+ return arrow.FixedWidthTypes.Boolean
+ // string: unicode character sequence
+ case "string":
+ return arrow.BinaryTypes.String
+ }
+ return nil
+}
+
+func avroLogicalToArrowField(n *schemaNode) {
+ var dt arrow.DataType
+ // Avro logical types
+ switch lt := n.schema.(avro.LogicalTypeSchema).Logical(); lt.Type() {
+ // The decimal logical type represents an arbitrary-precision signed decimal number of the form unscaled × 10^-scale.
+ // A decimal logical type annotates Avro bytes or fixed types. The byte array must contain the two’s-complement
+ // representation of the unscaled integer value in big-endian byte order. The scale is fixed, and is specified
+ // using an attribute.
+ //
+ // The following attributes are supported:
+ // scale, a JSON integer representing the scale (optional). If not specified the scale is 0.
+ // precision, a JSON integer representing the (maximum) precision of decimals stored in this type (required).
+ case "decimal":
+ id := arrow.DECIMAL128
+ if lt.(*avro.DecimalLogicalSchema).Precision() > decimal128.MaxPrecision {
+ id = arrow.DECIMAL256
+ }
+ dt, _ = arrow.NewDecimalType(id, int32(lt.(*avro.DecimalLogicalSchema).Precision()), int32(lt.(*avro.DecimalLogicalSchema).Scale()))
+
+ // The uuid logical type represents a randomly generated universally unique identifier (UUID).
+ // A uuid logical type annotates an Avro string.
The string has to conform with RFC-4122 + case "uuid": + dt = types.NewUUIDType() + + // The date logical type represents a date within the calendar, with no reference to a particular + // time zone or time of day. + // A date logical type annotates an Avro int, where the int stores the number of days from the unix epoch, + // 1 January 1970 (ISO calendar). + case "date": + dt = arrow.FixedWidthTypes.Date32 + + // The time-millis logical type represents a time of day, with no reference to a particular calendar, + // time zone or date, with a precision of one millisecond. + // A time-millis logical type annotates an Avro int, where the int stores the number of milliseconds + // after midnight, 00:00:00.000. + case "time-millis": + dt = arrow.FixedWidthTypes.Time32ms + + // The time-micros logical type represents a time of day, with no reference to a particular calendar, + // time zone or date, with a precision of one microsecond. + // A time-micros logical type annotates an Avro long, where the long stores the number of microseconds + // after midnight, 00:00:00.000000. + case "time-micros": + dt = arrow.FixedWidthTypes.Time64us + + // The timestamp-millis logical type represents an instant on the global timeline, independent of a + // particular time zone or calendar, with a precision of one millisecond. Please note that time zone + // information gets lost in this process. Upon reading a value back, we can only reconstruct the instant, + // but not the original representation. In practice, such timestamps are typically displayed to users in + // their local time zones, therefore they may be displayed differently depending on the execution environment. + // A timestamp-millis logical type annotates an Avro long, where the long stores the number of milliseconds + // from the unix epoch, 1 January 1970 00:00:00.000 UTC. + case "timestamp-millis": + dt = arrow.FixedWidthTypes.Timestamp_ms + + // The timestamp-micros logical type represents an instant on the global timeline, independent of a + // particular time zone or calendar, with a precision of one microsecond. Please note that time zone + // information gets lost in this process. Upon reading a value back, we can only reconstruct the instant, + // but not the original representation. In practice, such timestamps are typically displayed to users + // in their local time zones, therefore they may be displayed differently depending on the execution environment. + // A timestamp-micros logical type annotates an Avro long, where the long stores the number of microseconds + // from the unix epoch, 1 January 1970 00:00:00.000000 UTC. + case "timestamp-micros": + dt = arrow.FixedWidthTypes.Timestamp_us + + // The local-timestamp-millis logical type represents a timestamp in a local timezone, regardless of + // what specific time zone is considered local, with a precision of one millisecond. + // A local-timestamp-millis logical type annotates an Avro long, where the long stores the number of + // milliseconds, from 1 January 1970 00:00:00.000. + // Note: not implemented in hamba/avro + // case "local-timestamp-millis": + // dt = &arrow.TimestampType{Unit: arrow.Millisecond} + + // The local-timestamp-micros logical type represents a timestamp in a local timezone, regardless of + // what specific time zone is considered local, with a precision of one microsecond. + // A local-timestamp-micros logical type annotates an Avro long, where the long stores the number of + // microseconds, from 1 January 1970 00:00:00.000000. 
+ // case "local-timestamp-micros": + // Note: not implemented in hamba/avro + // dt = &arrow.TimestampType{Unit: arrow.Microsecond} + + // The duration logical type represents an amount of time defined by a number of months, days and milliseconds. + // This is not equivalent to a number of milliseconds, because, depending on the moment in time from which the + // duration is measured, the number of days in the month and number of milliseconds in a day may differ. Other + // standard periods such as years, quarters, hours and minutes can be expressed through these basic periods. + + // A duration logical type annotates Avro fixed type of size 12, which stores three little-endian unsigned integers + // that represent durations at different granularities of time. The first stores a number in months, the second + // stores a number in days, and the third stores a number in milliseconds. + case "duration": + dt = arrow.FixedWidthTypes.MonthDayNanoInterval + } + n.arrowField = buildArrowField(n, dt, arrow.Metadata{}) +} diff --git a/go/arrow/avro/schema_test.go b/go/arrow/avro/schema_test.go new file mode 100644 index 0000000000000..08a3fe1ed7440 --- /dev/null +++ b/go/arrow/avro/schema_test.go @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package avro + +import ( + "fmt" + "testing" + + "github.com/apache/arrow/go/v15/arrow" + hamba "github.com/hamba/avro/v2" +) + +func TestSchemaStringEqual(t *testing.T) { + tests := []struct { + avroSchema string + arrowSchema []arrow.Field + }{ + { + avroSchema: `{ + "fields": [ + { + "name": "inheritNull", + "type": { + "name": "Simple", + "symbols": [ + "a", + "b" + ], + "type": "enum" + } + }, + { + "name": "explicitNamespace", + "type": { + "name": "test", + "namespace": "org.hamba.avro", + "size": 12, + "type": "fixed" + } + }, + { + "name": "fullName", + "type": { + "type": "record", + "name": "fullName_data", + "namespace": "ignored", + "doc": "A name attribute with a fullname, so the namespace attribute is ignored. The fullname is 'a.full.Name', and the namespace is 'a.full'.", + "fields": [{ + "name": "inheritNamespace", + "type": { + "type": "enum", + "name": "Understanding", + "doc": "A simple name (attribute) and no namespace attribute: inherit the namespace of the enclosing type 'a.full.Name'. 
The fullname is 'a.full.Understanding'.", + "symbols": ["d", "e"] + } + }, { + "name": "md5", + "type": { + "name": "md5_data", + "type": "fixed", + "size": 16, + "namespace": "ignored" + } + } + ] + } + }, + { + "name": "id", + "type": "int" + }, + { + "name": "bigId", + "type": "long" + }, + { + "name": "temperature", + "type": [ + "null", + "float" + ] + }, + { + "name": "fraction", + "type": [ + "null", + "double" + ] + }, + { + "name": "is_emergency", + "type": "boolean" + }, + { + "name": "remote_ip", + "type": [ + "null", + "bytes" + ] + }, + { + "name": "person", + "type": { + "fields": [ + { + "name": "lastname", + "type": "string" + }, + { + "name": "address", + "type": { + "fields": [ + { + "name": "streetaddress", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ], + "name": "AddressUSRecord", + "type": "record" + } + }, + { + "name": "mapfield", + "type": { + "default": { + }, + "type": "map", + "values": "long" + } + }, + { + "name": "arrayField", + "type": { + "default": [ + ], + "items": "string", + "type": "array" + } + } + ], + "name": "person_data", + "type": "record" + } + }, + { + "name": "decimalField", + "type": { + "logicalType": "decimal", + "precision": 4, + "scale": 2, + "type": "bytes" + } + }, + { + "logicalType": "uuid", + "name": "uuidField", + "type": "string" + }, + { + "name": "timemillis", + "type": { + "type": "int", + "logicalType": "time-millis" + } + }, + { + "name": "timemicros", + "type": { + "type": "long", + "logicalType": "time-micros" + } + }, + { + "name": "timestampmillis", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "timestampmicros", + "type": { + "type": "long", + "logicalType": "timestamp-micros" + } + }, + { + "name": "duration", + "type": { + "name": "duration", + "namespace": "whyowhy", + "logicalType": "duration", + "size": 12, + "type": "fixed" + } + }, + { + "name": "date", + "type": { + "logicalType": "date", + "type": "int" + } + } + ], + "name": "Example", + "type": "record" + }`, + arrowSchema: []arrow.Field{ + { + Name: "inheritNull", + Type: &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint8, ValueType: arrow.BinaryTypes.String, Ordered: false}, + Metadata: arrow.MetadataFrom(map[string]string{"0": "a", "1": "b"}), + }, + { + Name: "explicitNamespace", + Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, + }, + { + Name: "fullName", + Type: arrow.StructOf( + arrow.Field{ + Name: "inheritNamespace", + Type: &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint8, ValueType: arrow.BinaryTypes.String, Ordered: false}, + }, + arrow.Field{ + Name: "md5", + Type: &arrow.FixedSizeBinaryType{ByteWidth: 16}, + }, + ), + }, + { + Name: "id", + Type: arrow.PrimitiveTypes.Int32, + }, + { + Name: "bigId", + Type: arrow.PrimitiveTypes.Int64, + }, + { + Name: "temperature", + Type: arrow.PrimitiveTypes.Float32, + Nullable: true, + }, + { + Name: "fraction", + Type: arrow.PrimitiveTypes.Float64, + Nullable: true, + }, + { + Name: "is_emergency", + Type: arrow.FixedWidthTypes.Boolean, + }, + { + Name: "remote_ip", + Type: arrow.BinaryTypes.Binary, + Nullable: true, + }, + { + Name: "person", + Type: arrow.StructOf( + arrow.Field{ + Name: "lastname", + Type: arrow.BinaryTypes.String, + Nullable: true, + }, + arrow.Field{ + Name: "address", + Type: arrow.StructOf( + arrow.Field{ + Name: "streetaddress", + Type: arrow.BinaryTypes.String, + }, + arrow.Field{ + Name: "city", + Type: arrow.BinaryTypes.String, + }, + ), + }, + arrow.Field{ + Name: "mapfield", + Type: 
arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int64), + Nullable: true, + }, + arrow.Field{ + Name: "arrayField", + Type: arrow.ListOfNonNullable(arrow.BinaryTypes.String), + }, + ), + }, + { + Name: "decimalField", + Type: &arrow.Decimal128Type{Precision: 4, Scale: 2}, + }, + { + Name: "uuidField", + Type: arrow.BinaryTypes.String, + }, + { + Name: "timemillis", + Type: arrow.FixedWidthTypes.Time32ms, + }, + { + Name: "timemicros", + Type: arrow.FixedWidthTypes.Time64us, + }, + { + Name: "timestampmillis", + Type: arrow.FixedWidthTypes.Timestamp_ms, + }, + { + Name: "timestampmicros", + Type: arrow.FixedWidthTypes.Timestamp_us, + }, + { + Name: "duration", + Type: arrow.FixedWidthTypes.MonthDayNanoInterval, + }, + { + Name: "date", + Type: arrow.FixedWidthTypes.Date32, + }, + }, + }, + } + + for _, test := range tests { + t.Run("", func(t *testing.T) { + want := arrow.NewSchema(test.arrowSchema, nil) + schema, err := hamba.ParseBytes([]byte(test.avroSchema)) + if err != nil { + t.Fatalf("%v", err) + } + got, err := ArrowSchemaFromAvro(schema) + if err != nil { + t.Fatalf("%v", err) + } + if !(fmt.Sprintf("%+v", want.String()) == fmt.Sprintf("%+v", got.String())) { + t.Fatalf("got=%v,\n want=%v", got.String(), want.String()) + } else { + t.Logf("schema.String() comparison passed") + } + }) + } +} diff --git a/go/arrow/avro/testdata/arrayrecordmap.avro b/go/arrow/avro/testdata/arrayrecordmap.avro new file mode 100644 index 0000000000000..84a8b59b427b5 Binary files /dev/null and b/go/arrow/avro/testdata/arrayrecordmap.avro differ diff --git a/go/arrow/avro/testdata/githubsamplecommits.avro b/go/arrow/avro/testdata/githubsamplecommits.avro new file mode 100644 index 0000000000000..f16d17d29e991 Binary files /dev/null and b/go/arrow/avro/testdata/githubsamplecommits.avro differ diff --git a/go/arrow/bitutil/bitmaps.go b/go/arrow/bitutil/bitmaps.go index d7516771def7f..887a1920bc933 100644 --- a/go/arrow/bitutil/bitmaps.go +++ b/go/arrow/bitutil/bitmaps.go @@ -360,7 +360,7 @@ func (bm *BitmapWordWriter) PutNextTrailingByte(b byte, validBits int) { bm.bitmap = bm.bitmap[1:] } else { debug.Assert(validBits > 0 && validBits < 8, "invalid valid bits in bitmap word writer") - debug.Assert(BytesForBits(int64(bm.offset+validBits)) <= int64(len(bm.bitmap)), "writing trailiing byte outside of bounds of bitmap") + debug.Assert(BytesForBits(int64(bm.offset+validBits)) <= int64(len(bm.bitmap)), "writing trailing byte outside of bounds of bitmap") wr := NewBitmapWriter(bm.bitmap, int(bm.offset), validBits) for i := 0; i < validBits; i++ { if b&0x01 != 0 { diff --git a/go/arrow/cdata/cdata_exports.go b/go/arrow/cdata/cdata_exports.go index 91f1b352e0327..d5fdc0dac15df 100644 --- a/go/arrow/cdata/cdata_exports.go +++ b/go/arrow/cdata/cdata_exports.go @@ -274,7 +274,7 @@ func (exp *schemaExporter) export(field arrow.Field) { exp.dict = new(schemaExporter) exp.dict.export(arrow.Field{Type: dt.ValueType}) case arrow.NestedType: - exp.children = make([]schemaExporter, len(dt.Fields())) + exp.children = make([]schemaExporter, dt.NumFields()) for i, f := range dt.Fields() { exp.children[i].export(f) } diff --git a/go/arrow/cdata/interface.go b/go/arrow/cdata/interface.go index 8ce06280a0bf5..d55a068aa1564 100644 --- a/go/arrow/cdata/interface.go +++ b/go/arrow/cdata/interface.go @@ -266,7 +266,7 @@ func ExportArrowArray(arr arrow.Array, out *CArrowArray, outSchema *CArrowSchema // ExportRecordReader populates the CArrowArrayStream that is passed in with the appropriate // callbacks to be a working 
ArrowArrayStream utilizing the passed in RecordReader. The // CArrowArrayStream takes ownership of the RecordReader until the consumer calls the release -// callback, as such it is unnecesary to call Release on the passed in reader unless it has +// callback, as such it is unnecessary to call Release on the passed in reader unless it has // previously been retained. // // WARNING: the output ArrowArrayStream MUST BE ZERO INITIALIZED, or the Go garbage diff --git a/go/arrow/compare_test.go b/go/arrow/compare_test.go index 170fc2d852a36..62e30e634ed0b 100644 --- a/go/arrow/compare_test.go +++ b/go/arrow/compare_test.go @@ -116,13 +116,13 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, false, true, }, @@ -131,13 +131,13 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, false, false, }, @@ -146,13 +146,13 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f0", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f0": []int{0}}, + index: map[string][]int{"f0": {0}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, false, false, }, @@ -161,14 +161,14 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, false, true, }, @@ -177,14 +177,14 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, false, false, }, @@ -193,13 +193,13 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, }, &StructType{ fields: []Field{ {Name: "f2", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f2": []int{0}}, + index: map[string][]int{"f2": {0}}, }, false, false, }, @@ -209,14 +209,14 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, 
"f2": {1}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, true, false, }, @@ -226,14 +226,14 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, true, false, }, @@ -243,7 +243,7 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, meta: MetadataFrom(map[string]string{"k1": "v1", "k2": "v2"}), }, &StructType{ @@ -251,7 +251,7 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, meta: MetadataFrom(map[string]string{"k2": "v2", "k1": "v1"}), }, true, true, @@ -261,14 +261,14 @@ func TestTypeEqual(t *testing.T) { fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, meta: MetadataFrom(map[string]string{"k1": "v1"}), }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0}}, + index: map[string][]int{"f1": {0}}, meta: MetadataFrom(map[string]string{"k1": "v2"}), }, true, false, @@ -279,14 +279,14 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true, Metadata: MetadataFrom(map[string]string{"k1": "v1"})}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true, Metadata: MetadataFrom(map[string]string{"k1": "v2"})}, {Name: "f2", Type: PrimitiveTypes.Float32, Nullable: false}, }, - index: map[string][]int{"f1": []int{0}, "f2": []int{1}}, + index: map[string][]int{"f1": {0}, "f2": {1}}, }, false, true, }, @@ -296,14 +296,14 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0, 1}}, + index: map[string][]int{"f1": {0, 1}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0, 1}}, + index: map[string][]int{"f1": {0, 1}}, }, true, true, }, @@ -313,14 +313,14 @@ func TestTypeEqual(t *testing.T) { {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, }, - index: map[string][]int{"f1": []int{0, 1}}, + index: 
map[string][]int{"f1": {0, 1}}, }, &StructType{ fields: []Field{ {Name: "f1", Type: PrimitiveTypes.Uint16, Nullable: true}, {Name: "f1", Type: PrimitiveTypes.Uint32, Nullable: true}, }, - index: map[string][]int{"f1": []int{0, 1}}, + index: map[string][]int{"f1": {0, 1}}, }, false, true, }, @@ -343,7 +343,6 @@ func TestTypeEqual(t *testing.T) { MapOf(BinaryTypes.String, &TimestampType{ Unit: 0, TimeZone: "UTC", - loc: nil, }), true, false, }, diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go index 821ffd9e068d4..c9c3f1ceb03e9 100644 --- a/go/arrow/compute/arithmetic_test.go +++ b/go/arrow/compute/arithmetic_test.go @@ -393,7 +393,7 @@ func (b *BinaryArithmeticSuite[T]) TestSub() { }) } -func (b *BinaryArithmeticSuite[T]) TestMuliply() { +func (b *BinaryArithmeticSuite[T]) TestMultiply() { b.Run(b.DataType().String(), func() { for _, overflow := range []bool{false, true} { b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() { diff --git a/go/arrow/compute/cast.go b/go/arrow/compute/cast.go index 133a983038ce1..385c0e6858968 100644 --- a/go/arrow/compute/cast.go +++ b/go/arrow/compute/cast.go @@ -266,8 +266,8 @@ func CastStruct(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) opts = ctx.State.(kernels.CastState) inType = batch.Values[0].Array.Type.(*arrow.StructType) outType = out.Type.(*arrow.StructType) - inFieldCount = len(inType.Fields()) - outFieldCount = len(outType.Fields()) + inFieldCount = inType.NumFields() + outFieldCount = outType.NumFields() ) fieldsToSelect := make([]int, outFieldCount) diff --git a/go/arrow/compute/datum.go b/go/arrow/compute/datum.go index 98bd1f1a0a326..388cfa10156d5 100644 --- a/go/arrow/compute/datum.go +++ b/go/arrow/compute/datum.go @@ -250,7 +250,7 @@ func (d *TableDatum) Equals(other Datum) bool { // an array.Table gets a TableDatum // a scalar.Scalar gets a ScalarDatum // -// Anything else is passed to scalar.MakeScalar and recieves a scalar +// Anything else is passed to scalar.MakeScalar and receives a scalar // datum of that appropriate type. func NewDatum(value interface{}) Datum { switch v := value.(type) { diff --git a/go/arrow/compute/doc.go b/go/arrow/compute/doc.go index 53a164e61e514..7c763cb18d0ff 100644 --- a/go/arrow/compute/doc.go +++ b/go/arrow/compute/doc.go @@ -23,7 +23,7 @@ // is an attempt to provide for those users, and in general create a // native-go arrow compute engine. // -// The overwhemling majority of things in this package require go1.18 as +// The overwhelming majority of things in this package require go1.18 as // it utilizes generics. The files in this package and its sub-packages // are all excluded from being built by go versions lower than 1.18 so // that the larger Arrow module itself is still compatible with go1.17. diff --git a/go/arrow/compute/exec/span.go b/go/arrow/compute/exec/span.go index 0b5f6208227f5..55753de9e0e73 100644 --- a/go/arrow/compute/exec/span.go +++ b/go/arrow/compute/exec/span.go @@ -633,7 +633,7 @@ func FillZeroLength(dt arrow.DataType, span *ArraySpan) { return } - span.resizeChildren(len(nt.Fields())) + span.resizeChildren(nt.NumFields()) for i, f := range nt.Fields() { FillZeroLength(f.Type, &span.Children[i]) } diff --git a/go/arrow/compute/exec/utils.go b/go/arrow/compute/exec/utils.go index 17bc30ef2adb0..276e4570ca968 100644 --- a/go/arrow/compute/exec/utils.go +++ b/go/arrow/compute/exec/utils.go @@ -194,7 +194,7 @@ func GetDataType[T NumericTypes | bool | string | []byte | float16.Num]() arrow. 
return typMap[reflect.TypeOf(z)] } -// GetType returns the appropriate arrow.Type type T, only for non-parameteric +// GetType returns the appropriate arrow.Type type T, only for non-parametric // types. This uses a map and reflection internally so don't call this in // a tight loop, instead call it once and then use a closure with the result. func GetType[T NumericTypes | bool | string]() arrow.Type { diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go index db89b206daf5f..b8144a4a8de87 100644 --- a/go/arrow/compute/executor.go +++ b/go/arrow/compute/executor.go @@ -391,7 +391,7 @@ func inferBatchLength(values []Datum) (length int64, allSame bool) { type KernelExecutor interface { // Init must be called *after* the kernel's init method and any // KernelState must be set into the KernelCtx *before* calling - // this Init method. This is to faciliate the case where + // this Init method. This is to facilitate the case where // Init may be expensive and does not need to be called // again for each execution of the kernel. For example, // the same lookup table can be re-used for all scanned batches diff --git a/go/arrow/compute/exprs/builders.go b/go/arrow/compute/exprs/builders.go index 0b694525d1b47..ebe1d4c27385b 100644 --- a/go/arrow/compute/exprs/builders.go +++ b/go/arrow/compute/exprs/builders.go @@ -162,7 +162,7 @@ func NewFieldRefFromDotPath(dotpath string, rootSchema *arrow.Schema) (expr.Refe idx, _ := strconv.Atoi(dotpath[:subend]) switch ct := curType.(type) { case *arrow.StructType: - if idx > len(ct.Fields()) { + if idx > ct.NumFields() { return nil, fmt.Errorf("%w: field out of bounds in dotpath", arrow.ErrIndex) } curType = ct.Field(idx).Type diff --git a/go/arrow/compute/exprs/exec.go b/go/arrow/compute/exprs/exec.go index 800ffe62f2559..d43703f5c1c8b 100644 --- a/go/arrow/compute/exprs/exec.go +++ b/go/arrow/compute/exprs/exec.go @@ -53,7 +53,7 @@ func makeExecBatch(ctx context.Context, schema *arrow.Schema, partial compute.Da partialBatch := partial.(*compute.RecordDatum).Value batchSchema := partialBatch.Schema() - out.Values = make([]compute.Datum, len(schema.Fields())) + out.Values = make([]compute.Datum, schema.NumFields()) out.Len = partialBatch.NumRows() for i, field := range schema.Fields() { @@ -99,7 +99,7 @@ func makeExecBatch(ctx context.Context, schema *arrow.Schema, partial compute.Da return makeExecBatch(ctx, schema, compute.NewDatumWithoutOwning(batch)) case *compute.ScalarDatum: out.Len = 1 - out.Values = make([]compute.Datum, len(schema.Fields())) + out.Values = make([]compute.Datum, schema.NumFields()) s := part.Value.(*scalar.Struct) dt := s.Type.(*arrow.StructType) diff --git a/go/arrow/compute/exprs/types.go b/go/arrow/compute/exprs/types.go index 6a5b81d11b3eb..87e08233cc00a 100644 --- a/go/arrow/compute/exprs/types.go +++ b/go/arrow/compute/exprs/types.go @@ -35,7 +35,7 @@ const ( // URI for official Arrow Substrait Extension Types ArrowExtTypesUri = "https://github.com/apache/arrow/blob/main/format/substrait/extension_types.yaml" SubstraitDefaultURIPrefix = extensions.SubstraitDefaultURIPrefix - // URI for official Substrait Arithemetic funcs extensions + // URI for official Substrait Arithmetic funcs extensions SubstraitArithmeticFuncsURI = SubstraitDefaultURIPrefix + "functions_arithmetic.yaml" // URI for official Substrait Comparison funcs extensions SubstraitComparisonFuncsURI = SubstraitDefaultURIPrefix + "functions_comparison.yaml" @@ -235,7 +235,7 @@ func encodeOptionlessOverflowableArithmetic(id extensions.ID) 
arrowToSubstrait { } } -// NewExtensionSetDefault is a convenince function to create a new extension +// NewExtensionSetDefault is a convenience function to create a new extension // set using the Default arrow extension ID registry. // // See NewExtensionSet for more info. @@ -594,7 +594,7 @@ func ToSubstraitType(dt arrow.DataType, nullable bool, ext ExtensionIDSet) (type Precision: dt.GetPrecision(), Scale: dt.GetScale()}, nil case arrow.STRUCT: dt := dt.(*arrow.StructType) - fields := make([]types.Type, len(dt.Fields())) + fields := make([]types.Type, dt.NumFields()) var err error for i, f := range dt.Fields() { fields[i], err = ToSubstraitType(f.Type, f.Nullable, ext) diff --git a/go/arrow/compute/fieldref_test.go b/go/arrow/compute/fieldref_test.go index c4fa72182835f..72985012e4a92 100644 --- a/go/arrow/compute/fieldref_test.go +++ b/go/arrow/compute/fieldref_test.go @@ -44,7 +44,7 @@ func TestFieldPathBasics(t *testing.T) { assert.Nil(t, f) assert.ErrorIs(t, err, compute.ErrEmpty) - f, err = compute.FieldPath{len(s.Fields()) * 2}.Get(s) + f, err = compute.FieldPath{s.NumFields() * 2}.Get(s) assert.Nil(t, f) assert.ErrorIs(t, err, compute.ErrIndexRange) } @@ -63,7 +63,7 @@ func TestFieldRefBasics(t *testing.T) { } // out of range index results in failure to match - assert.Empty(t, compute.FieldRefIndex(len(s.Fields())*2).FindAll(s.Fields())) + assert.Empty(t, compute.FieldRefIndex(s.NumFields()*2).FindAll(s.Fields())) // lookup by name returns the indices of both matching fields assert.Equal(t, []compute.FieldPath{{0}, {2}}, compute.FieldRefName("alpha").FindAll(s.Fields())) diff --git a/go/arrow/compute/functions.go b/go/arrow/compute/functions.go index b85062721d9b8..f35d9facaf2d3 100644 --- a/go/arrow/compute/functions.go +++ b/go/arrow/compute/functions.go @@ -179,7 +179,7 @@ func (b *baseFunction) checkArity(nargs int) error { return nil } -// kernelType is a type contstraint interface that is used for funcImpl +// kernelType is a type constraint interface that is used for funcImpl // generic definitions. It will be extended as other kernel types // are defined. // @@ -227,7 +227,7 @@ func (fi *funcImpl[KT]) Kernels() []*KT { // A ScalarFunction is a function that executes element-wise operations // on arrays or scalars, and therefore whose results generally do not -// depent on the order of the values in the arguments. Accepts and returns +// depend on the order of the values in the arguments. Accepts and returns // arrays that are all of the same size. These functions roughly correspond // to the functions used in most SQL expressions. 
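//
// Illustrative sketch (not part of the patch): calling such a scalar function
// through the compute package. It assumes the package-level CallFunction
// helper, a registered "add" kernel, and the usual context/arrow/compute
// imports; treat those names as assumptions of this note rather than facts
// established by the diff.
//
//	func addArrays(ctx context.Context, lhs, rhs arrow.Array) (compute.Datum, error) {
//		left, right := compute.NewDatum(lhs), compute.NewDatum(rhs)
//		defer left.Release()
//		defer right.Release()
//		// element-wise: the output length equals the (equal) input lengths,
//		// and reordering both inputs identically only reorders the output.
//		return compute.CallFunction(ctx, "add", nil, left, right)
//	}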
type ScalarFunction struct { diff --git a/go/arrow/compute/internal/kernels/base_arithmetic.go b/go/arrow/compute/internal/kernels/base_arithmetic.go index c7950877264df..4ef0031f31484 100644 --- a/go/arrow/compute/internal/kernels/base_arithmetic.go +++ b/go/arrow/compute/internal/kernels/base_arithmetic.go @@ -795,7 +795,7 @@ func getArithmeticOpDecimalImpl[T decimal128.Num | decimal256.Num](op Arithmetic return int64(fns.Sign(arg)) }) } - debug.Assert(false, "unimplemented arithemtic op") + debug.Assert(false, "unimplemented arithmetic op") return nil } diff --git a/go/arrow/compute/internal/kernels/rounding.go b/go/arrow/compute/internal/kernels/rounding.go index 8a1bec1180ac8..345c779085fe8 100644 --- a/go/arrow/compute/internal/kernels/rounding.go +++ b/go/arrow/compute/internal/kernels/rounding.go @@ -619,7 +619,7 @@ func (rnd *roundToMultipleDec[T]) call(_ *exec.KernelCtx, arg T, e *error) T { if rnd.mode >= HalfDown { if rnd.hasHalfwayPoint && (remainder == rnd.halfMult || remainder == rnd.negHalfMult) { // on the halfway point, use tiebreaker - // manually implement rounding since we're not actually rounding + // manually implement rounding since we aren't actually rounding // a decimal value, but rather manipulating the multiple switch rnd.mode { case HalfDown: @@ -666,7 +666,7 @@ func (rnd *roundToMultipleDec[T]) call(_ *exec.KernelCtx, arg T, e *error) T { } } } else { - // manually implement rounding since we're not actually rounding + // manually implement rounding since we aren't actually rounding // a decimal value, but rather manipulating the multiple switch rnd.mode { case RoundDown: diff --git a/go/arrow/compute/internal/kernels/vector_run_end_encode.go b/go/arrow/compute/internal/kernels/vector_run_end_encode.go index a147bf7d50170..076bef1368438 100644 --- a/go/arrow/compute/internal/kernels/vector_run_end_encode.go +++ b/go/arrow/compute/internal/kernels/vector_run_end_encode.go @@ -394,7 +394,7 @@ func (re *runEndEncodeLoopBinary[R, O]) WriteEncodedRuns(out *exec.ExecResult) i outputValues := out.Children[1].Buffers[2].Buf // re.offsetValues already accounts for the input.offset so we don't - // need to initalize readOffset to re.inputOffset + // need to initialize readOffset to re.inputOffset var readOffset int64 currentRun, curRunValid := re.readValue(readOffset) readOffset++ diff --git a/go/arrow/compute/registry_test.go b/go/arrow/compute/registry_test.go index 4e4f44f1d39b6..b725091090434 100644 --- a/go/arrow/compute/registry_test.go +++ b/go/arrow/compute/registry_test.go @@ -81,7 +81,7 @@ func TestRegistryBasics(t *testing.T) { assert.True(t, ok) assert.Same(t, fn, f1) - // non-existent + // nonexistent _, ok = registry.GetFunction("f2") assert.False(t, ok) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 2a1f7300a986f..99dac29f4d728 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -172,7 +172,7 @@ func WithNullWriter(null string) Option { } // WithBoolWriter override the default bool formatter with a function that returns -// a string representaton of bool states. i.e. True, False, 1, 0 +// a string representation of bool states. i.e. 
True, False, 1, 0 func WithBoolWriter(fmtr func(bool) string) Option { return func(cfg config) { switch cfg := cfg.(type) { diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index 1918e2e492dff..644cae0933f4c 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -316,7 +316,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo t.Fatal(err) } - bdata, err := expectedOutout(data) + bdata, err := expectedOutput(data) if err != nil { t.Fatal(err) } @@ -326,7 +326,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo } } -func expectedOutout(data [][]string) (*bytes.Buffer, error) { +func expectedOutput(data [][]string) (*bytes.Buffer, error) { b := bytes.NewBuffer(nil) w := ecsv.NewWriter(b) w.Comma = separator diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index 1e5d8fb98aa59..b2f2329e5ccaa 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -144,7 +144,7 @@ const ( // like BINARY but with 64-bit offsets, not yet implemented LARGE_BINARY - // like LIST but with 64-bit offsets. not yet implmented + // like LIST but with 64-bit offsets. not yet implemented LARGE_LIST // calendar interval with three fields diff --git a/go/arrow/datatype_encoded.go b/go/arrow/datatype_encoded.go index c1750a8894f43..749f03a582646 100644 --- a/go/arrow/datatype_encoded.go +++ b/go/arrow/datatype_encoded.go @@ -58,6 +58,8 @@ func (t *RunEndEncodedType) Fields() []Field { } } +func (t *RunEndEncodedType) NumFields() int { return 2 } + func (*RunEndEncodedType) ValidRunEndsType(dt DataType) bool { switch dt.ID() { case INT16, INT32, INT64: diff --git a/go/arrow/datatype_extension.go b/go/arrow/datatype_extension.go index 271c8b0dbc789..f0bcccdf2bffc 100644 --- a/go/arrow/datatype_extension.go +++ b/go/arrow/datatype_extension.go @@ -46,7 +46,7 @@ func getExtTypeRegistry() *sync.Map { } // RegisterExtensionType registers the provided ExtensionType by calling ExtensionName -// to use as a Key for registrying the type. If a type with the same name is already +// to use as a Key for registering the type. If a type with the same name is already // registered then this will return an error saying so, otherwise it will return nil // if successful registering the type. // This function is safe to call from multiple goroutines simultaneously. @@ -117,7 +117,7 @@ type ExtensionType interface { // concurrently. Serialize() string // Deserialize is called when reading in extension arrays and types via the IPC format - // in order to construct an instance of the appropriate extension type. The data passed in + // in order to construct an instance of the appropriate extension type. The passed in data // is pulled from the ARROW:extension:metadata key and may be nil or an empty slice. // If the storage type is incorrect or something else is invalid with the data this should // return nil and an appropriate error. @@ -161,6 +161,13 @@ func (e *ExtensionBase) Fields() []Field { return nil } +func (e *ExtensionBase) NumFields() int { + if nested, ok := e.Storage.(NestedType); ok { + return nested.NumFields() + } + return 0 +} + func (e *ExtensionBase) Layout() DataTypeLayout { return e.Storage.Layout() } // this no-op exists to ensure that this type must be embedded in any user-defined extension type. 
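// Illustrative sketch (not part of the patch): the recurring refactor in this
// diff replaces len(x.Fields()) with x.NumFields(). Fields() returns a fresh
// copy of the field slice on every call, so consulting it only for the count
// allocates needlessly; NumFields() reports the count without allocating.
// The preferred iteration over a nested type then looks like this (shown for
// *arrow.StructType, whose Field(i) accessor appears in the datatype_nested.go
// hunk below):
//
//	func fieldNames(st *arrow.StructType) []string {
//		names := make([]string, st.NumFields())
//		for i := 0; i < st.NumFields(); i++ {
//			names[i] = st.Field(i).Name // Field(i) returns a single arrow.Field, no slice copy
//		}
//		return names
//	}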
diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index bcbc8ef6aec87..1a3074e59e75f 100644 --- a/go/arrow/datatype_fixedwidth.go +++ b/go/arrow/datatype_fixedwidth.go @@ -19,6 +19,7 @@ package arrow import ( "fmt" "strconv" + "sync" "time" "github.com/apache/arrow/go/v15/internal/json" @@ -354,6 +355,7 @@ type TimestampType struct { TimeZone string loc *time.Location + mx sync.RWMutex } func (*TimestampType) ID() Type { return TIMESTAMP } @@ -386,6 +388,8 @@ func (t *TimestampType) TimeUnit() TimeUnit { return t.Unit } // This should be called if you change the value of the TimeZone after having // potentially called GetZone. func (t *TimestampType) ClearCachedLocation() { + t.mx.Lock() + defer t.mx.Unlock() t.loc = nil } @@ -398,10 +402,20 @@ func (t *TimestampType) ClearCachedLocation() { // so if you change the value of TimeZone after calling this, make sure to call // ClearCachedLocation. func (t *TimestampType) GetZone() (*time.Location, error) { + t.mx.RLock() if t.loc != nil { + defer t.mx.RUnlock() return t.loc, nil } + t.mx.RUnlock() + t.mx.Lock() + defer t.mx.Unlock() + // in case GetZone() was called in between releasing the read lock and + // getting the write lock + if t.loc != nil { + return t.loc, nil + } // the TimeZone string is allowed to be either a valid tzdata string // such as "America/New_York" or an absolute offset of the form -XX:XX // or +XX:XX @@ -415,7 +429,7 @@ func (t *TimestampType) GetZone() (*time.Location, error) { if loc, err := time.LoadLocation(t.TimeZone); err == nil { t.loc = loc - return t.loc, err + return loc, err } // at this point we know that the timezone isn't empty, and didn't match diff --git a/go/arrow/datatype_fixedwidth_test.go b/go/arrow/datatype_fixedwidth_test.go index 918572d40b8f4..b3cbb465f3db6 100644 --- a/go/arrow/datatype_fixedwidth_test.go +++ b/go/arrow/datatype_fixedwidth_test.go @@ -17,6 +17,7 @@ package arrow_test import ( + "sync" "testing" "time" @@ -180,6 +181,30 @@ func TestTimestampType_GetToTimeFunc(t *testing.T) { assert.Equal(t, "2345-12-29T19:00:00-05:00", toTimeNY(ts).Format(time.RFC3339)) } +// Test race condition from GH-38795 +func TestGetToTimeFuncRace(t *testing.T) { + var ( + wg sync.WaitGroup + w = make(chan bool) + routineNum = 10 + ) + + wg.Add(routineNum) + for i := 0; i < routineNum; i++ { + go func() { + defer wg.Done() + + <-w + + _, _ = arrow.FixedWidthTypes.Timestamp_s.(*arrow.TimestampType).GetToTimeFunc() + }() + } + + close(w) + + wg.Wait() +} + func TestTime32Type(t *testing.T) { for _, tc := range []struct { unit arrow.TimeUnit diff --git a/go/arrow/datatype_nested.go b/go/arrow/datatype_nested.go index e381cd7047e45..9a8873a50eb1a 100644 --- a/go/arrow/datatype_nested.go +++ b/go/arrow/datatype_nested.go @@ -32,6 +32,8 @@ type ( // Fields method provides a copy of NestedType fields // (so it can be safely mutated and will not result in updating the NestedType). Fields() []Field + // NumFields provides the number of fields without allocating. 
+ NumFields() int } ListLikeType interface { @@ -109,6 +111,8 @@ func (t *ListType) ElemField() Field { func (t *ListType) Fields() []Field { return []Field{t.ElemField()} } +func (t *ListType) NumFields() int { return 1 } + func (*ListType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int32SizeBytes)}} } @@ -242,6 +246,8 @@ func (t *FixedSizeListType) Fingerprint() string { func (t *FixedSizeListType) Fields() []Field { return []Field{t.ElemField()} } +func (t *FixedSizeListType) NumFields() int { return 1 } + func (*FixedSizeListType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap()}} } @@ -308,6 +314,8 @@ func (t *ListViewType) ElemField() Field { func (t *ListViewType) Fields() []Field { return []Field{t.ElemField()} } +func (t *ListViewType) NumFields() int { return 1 } + func (*ListViewType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int32SizeBytes), SpecFixedWidth(Int32SizeBytes)}} } @@ -376,6 +384,8 @@ func (t *LargeListViewType) ElemField() Field { func (t *LargeListViewType) Fields() []Field { return []Field{t.ElemField()} } +func (t *LargeListViewType) NumFields() int { return 1 } + func (*LargeListViewType) Layout() DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int64SizeBytes), SpecFixedWidth(Int64SizeBytes)}} } @@ -447,6 +457,8 @@ func (t *StructType) Fields() []Field { return fields } +func (t *StructType) NumFields() int { return len(t.fields) } + func (t *StructType) Field(i int) Field { return t.fields[i] } // FieldByName gets the field with the given name. @@ -464,7 +476,7 @@ func (t *StructType) FieldByName(name string) (Field, bool) { // FieldIdx gets the index of the field with the given name. // // If there are multiple fields with the given name, FieldIdx returns -// the index of the first first such field. +// the index of the first such field. func (t *StructType) FieldIdx(name string) (int, bool) { i, ok := t.index[name] if ok { @@ -598,6 +610,8 @@ func (t *MapType) Fingerprint() string { func (t *MapType) Fields() []Field { return []Field{t.ElemField()} } +func (t *MapType) NumFields() int { return 1 } + func (t *MapType) Layout() DataTypeLayout { return t.value.Layout() } @@ -690,6 +704,8 @@ func (t *unionType) Fields() []Field { return fields } +func (t *unionType) NumFields() int { return len(t.children) } + func (t *unionType) TypeCodes() []UnionTypeCode { return t.typeCodes } func (t *unionType) ChildIDs() []int { return t.childIDs[:] } diff --git a/go/arrow/datatype_nested_test.go b/go/arrow/datatype_nested_test.go index 1e09f49da147c..a1daa8e58df31 100644 --- a/go/arrow/datatype_nested_test.go +++ b/go/arrow/datatype_nested_test.go @@ -205,7 +205,7 @@ func TestStructOf(t *testing.T) { t.Fatalf("invalid name. got=%q, want=%q", got, want) } - if got, want := len(got.Fields()), len(tc.fields); got != want { + if got, want := got.NumFields(), len(tc.fields); got != want { t.Fatalf("invalid number of fields. got=%d, want=%d", got, want) } diff --git a/go/arrow/decimal128/decimal128.go b/go/arrow/decimal128/decimal128.go index 3b88dce1fa809..7ce8cd51b0717 100644 --- a/go/arrow/decimal128/decimal128.go +++ b/go/arrow/decimal128/decimal128.go @@ -237,7 +237,7 @@ func FromString(v string, prec, scale int32) (n Num, err error) { // math/big library refers to precision in floating point terms // where it refers to the "number of bits of precision in the mantissa". 
// So we need to figure out how many bits we should use for precision, - // based on the input precision. Too much precision and we're not rounding + // based on the input precision. Too much precision and we aren't rounding // when we should. Too little precision and we round when we shouldn't. // // In general, the number of decimal digits you get from a given number diff --git a/go/arrow/decimal128/decimal128_test.go b/go/arrow/decimal128/decimal128_test.go index 4cfd7db20db08..05af1f557f1f9 100644 --- a/go/arrow/decimal128/decimal128_test.go +++ b/go/arrow/decimal128/decimal128_test.go @@ -492,7 +492,7 @@ func TestDecimalToReal(t *testing.T) { checkDecimalToFloat(t, "9223373136366403584", float64(9.223373136366404e+18), 0) checkDecimalToFloat(t, "-9223373136366403584", float64(-9.223373136366404e+18), 0) - // 2**64 - 2**11 (exactly represntable in a float64) + // 2**64 - 2**11 (exactly representable in a float64) checkDecimalToFloat(t, "18446746272732807168", float64(1.8446746272732807e+19), 0) checkDecimalToFloat(t, "-18446746272732807168", float64(-1.8446746272732807e+19), 0) diff --git a/go/arrow/decimal256/decimal256.go b/go/arrow/decimal256/decimal256.go index 5f2ad5f32165c..65be0df9dd424 100644 --- a/go/arrow/decimal256/decimal256.go +++ b/go/arrow/decimal256/decimal256.go @@ -125,7 +125,7 @@ func FromString(v string, prec, scale int32) (n Num, err error) { // math/big library refers to precision in floating point terms // where it refers to the "number of bits of precision in the mantissa". // So we need to figure out how many bits we should use for precision, - // based on the input precision. Too much precision and we're not rounding + // based on the input precision. Too much precision and we aren't rounding // when we should. Too little precision and we round when we shouldn't. 
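//
// Illustrative arithmetic (not part of the patch): one decimal digit carries
// log2(10) ≈ 3.32 bits of information, so prec decimal digits need roughly
// ceil(prec * 3.32) bits of mantissa. For example prec = 38 (the Decimal128
// maximum) needs about 127 bits, and prec = 76 (the Decimal256 maximum) needs
// about 253 bits.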
// // In general, the number of decimal digits you get from a given number diff --git a/go/arrow/encoded/ree_utils_test.go b/go/arrow/encoded/ree_utils_test.go index 9470331002b80..57d11e9b4edbe 100644 --- a/go/arrow/encoded/ree_utils_test.go +++ b/go/arrow/encoded/ree_utils_test.go @@ -84,9 +84,9 @@ func TestMergedRunsIter(t *testing.T) { expectedRunLengths = []int32{5, 4, 6, 5, 5, 25} expectedLeftVisits = []int32{110, 111, 111, 112, 113, 114} expectedRightVisits = []int32{205, 206, 207, 207, 207, 208} - leftPrntOffset int32 = 1000 + leftPrintOffset int32 = 1000 leftChildOffset int32 = 100 - rightPrntOffset int32 = 2000 + rightPrintOffset int32 = 2000 rightChildOffset int32 = 200 leftChild arrow.Array = array.NewNull(int(leftChildOffset) + leftRunEnds.Len()) @@ -101,9 +101,9 @@ func TestMergedRunsIter(t *testing.T) { rightArray := arrow.Array(array.NewRunEndEncodedArray(rightRunEnds, rightChild, 2050, 0)) defer rightArray.Release() - leftArray = array.NewSlice(leftArray, int64(leftPrntOffset), int64(leftArray.Len())) + leftArray = array.NewSlice(leftArray, int64(leftPrintOffset), int64(leftArray.Len())) defer leftArray.Release() - rightArray = array.NewSlice(rightArray, int64(rightPrntOffset), int64(rightArray.Len())) + rightArray = array.NewSlice(rightArray, int64(rightPrintOffset), int64(rightArray.Len())) defer rightArray.Release() pos, logicalPos := 0, 0 diff --git a/go/arrow/flight/client.go b/go/arrow/flight/client.go index 8c400eb66b64d..4f44bdc5ebd58 100644 --- a/go/arrow/flight/client.go +++ b/go/arrow/flight/client.go @@ -265,7 +265,7 @@ func NewFlightClient(addr string, auth ClientAuthHandler, opts ...grpc.DialOptio return &client{conn: conn, FlightServiceClient: flight.NewFlightServiceClient(conn), authHandler: auth}, nil } -// NewClientWithMiddleware takes a slice of middlewares in addition to the auth and address which will be +// NewClientWithMiddleware takes a slice of middleware in addition to the auth and address which will be // used by grpc and chained, the first middleware will be the outer most with the last middleware // being the inner most wrapper around the actual call. It also passes along the dialoptions passed in such // as TLS certs and so on. diff --git a/go/arrow/flight/flightsql/driver/README.md b/go/arrow/flight/flightsql/driver/README.md index f1447a7d24256..fade2cc5c4eec 100644 --- a/go/arrow/flight/flightsql/driver/README.md +++ b/go/arrow/flight/flightsql/driver/README.md @@ -25,7 +25,7 @@ connection pooling, transactions combined with ease of use (see (#usage)). --------------------------------------- -* [Prerequisits](#prerequisits) +* [Prerequisites](#prerequisites) * [Usage](#usage) * [Data Source Name (DSN)](#data-source-name-dsn) * [Driver config usage](#driver-config-usage) @@ -218,7 +218,7 @@ configuration. In this case you need to call `RegisterTLSConfig()` in your code ... ``` -This will register the custom configuration, constraining the minimim TLS +This will register the custom configuration, constraining the minimum TLS version, as `myconfig` and then references the registered configuration by name in the DSN. You can reuse the same TLS configuration by registering once and then reference in multiple DSNs. 
Registering multiple configurations with diff --git a/go/arrow/flight/flightsql/driver/driver.go b/go/arrow/flight/flightsql/driver/driver.go index e325489236c6d..e31e572586557 100644 --- a/go/arrow/flight/flightsql/driver/driver.go +++ b/go/arrow/flight/flightsql/driver/driver.go @@ -159,7 +159,7 @@ func (s *Stmt) NumInput() int { // If NumInput returns >= 0, the sql package will sanity check argument // counts from callers and return errors to the caller before the // statement's Exec or Query methods are called. - return len(schema.Fields()) + return schema.NumFields() } // Exec executes a query that doesn't return rows, such diff --git a/go/arrow/flight/flightsql/example/sql_batch_reader.go b/go/arrow/flight/flightsql/example/sql_batch_reader.go index 36a0d7b424544..8c87021672de5 100644 --- a/go/arrow/flight/flightsql/example/sql_batch_reader.go +++ b/go/arrow/flight/flightsql/example/sql_batch_reader.go @@ -111,7 +111,7 @@ type SqlBatchReader struct { } func NewSqlBatchReaderWithSchema(mem memory.Allocator, schema *arrow.Schema, rows *sql.Rows) (*SqlBatchReader, error) { - rowdest := make([]interface{}, len(schema.Fields())) + rowdest := make([]interface{}, schema.NumFields()) for i, f := range schema.Fields() { switch f.Type.ID() { case arrow.DENSE_UNION, arrow.SPARSE_UNION: diff --git a/go/arrow/flight/flightsql/example/sqlite_server.go b/go/arrow/flight/flightsql/example/sqlite_server.go index f06dd0210655f..24cadd957e584 100644 --- a/go/arrow/flight/flightsql/example/sqlite_server.go +++ b/go/arrow/flight/flightsql/example/sqlite_server.go @@ -684,7 +684,7 @@ func (s *SQLiteFlightSQLServer) GetFlightInfoPrimaryKeys(_ context.Context, cmd } func (s *SQLiteFlightSQLServer) DoGetPrimaryKeys(ctx context.Context, cmd flightsql.TableRef) (*arrow.Schema, <-chan flight.StreamChunk, error) { - // the field key_name can not be recovered by sqlite so it is + // the field key_name cannot be recovered by sqlite so it is // being set to null following the same pattern for catalog name and schema_name var b strings.Builder diff --git a/go/arrow/flight/flightsql/server.go b/go/arrow/flight/flightsql/server.go index a086610433eae..5b1764707c298 100644 --- a/go/arrow/flight/flightsql/server.go +++ b/go/arrow/flight/flightsql/server.go @@ -274,7 +274,7 @@ type BaseServer struct { sqlInfoToResult SqlInfoResultMap // Alloc allows specifying a particular allocator to use for any // allocations done by the base implementation. 
- // Will use memory.DefaultAlloctor if nil + // Will use memory.DefaultAllocator if nil Alloc memory.Allocator } @@ -646,7 +646,7 @@ type Server interface { BeginSavepoint(context.Context, ActionBeginSavepointRequest) (id []byte, err error) // EndSavepoint releases or rolls back a savepoint EndSavepoint(context.Context, ActionEndSavepointRequest) error - // EndTransaction commits or rollsback a transaction + // EndTransaction commits or rolls back a transaction EndTransaction(context.Context, ActionEndTransactionRequest) error // CancelFlightInfo attempts to explicitly cancel a FlightInfo CancelFlightInfo(context.Context, *flight.CancelFlightInfoRequest) (flight.CancelFlightInfoResult, error) diff --git a/go/arrow/flight/flightsql/types.go b/go/arrow/flight/flightsql/types.go index 2b7419482e3e3..5a26414d8c232 100644 --- a/go/arrow/flight/flightsql/types.go +++ b/go/arrow/flight/flightsql/types.go @@ -525,7 +525,7 @@ const ( SqlInfoMaxCharLiteralLen = SqlInfo(pb.SqlInfo_SQL_MAX_CHAR_LITERAL_LENGTH) // Retrieves a int64 value representing the maximum number of characters allowed for a column name. SqlInfoMaxColumnNameLen = SqlInfo(pb.SqlInfo_SQL_MAX_COLUMN_NAME_LENGTH) - // Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. + // Retrieves a int64 value representing the maximum number of columns allowed in a GROUP BY clause. SqlInfoMaxColumnsInGroupBy = SqlInfo(pb.SqlInfo_SQL_MAX_COLUMNS_IN_GROUP_BY) // Retrieves a int64 value representing the maximum number of columns allowed in an index. SqlInfoMaxColumnsInIndex = SqlInfo(pb.SqlInfo_SQL_MAX_COLUMNS_IN_INDEX) @@ -705,7 +705,7 @@ const ( // - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] // Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. - SqlInfoSupportedConcurrenciesForResultSetScrollInensitive = SqlInfo(pb.SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE) + SqlInfoSupportedConcurrenciesForResultSetScrollInsensitive = SqlInfo(pb.SqlInfo_SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE) // Retrieves a boolean value indicating whether this database supports batch updates. // diff --git a/go/arrow/flight/gen/flight/FlightSql.pb.go b/go/arrow/flight/gen/flight/FlightSql.pb.go index 494bf8bcca115..279dc29c4262a 100644 --- a/go/arrow/flight/gen/flight/FlightSql.pb.go +++ b/go/arrow/flight/gen/flight/FlightSql.pb.go @@ -421,7 +421,7 @@ const ( SqlInfo_SQL_MAX_CHAR_LITERAL_LENGTH SqlInfo = 542 // Retrieves a int64 value representing the maximum number of characters allowed for a column name. SqlInfo_SQL_MAX_COLUMN_NAME_LENGTH SqlInfo = 543 - // Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. + // Retrieves a int64 value representing the maximum number of columns allowed in a GROUP BY clause. SqlInfo_SQL_MAX_COLUMNS_IN_GROUP_BY SqlInfo = 544 // Retrieves a int64 value representing the maximum number of columns allowed in an index. SqlInfo_SQL_MAX_COLUMNS_IN_INDEX SqlInfo = 545 @@ -1704,7 +1704,7 @@ func (SqlSupportsConvert) EnumDescriptor() ([]byte, []int) { //* // The JDBC/ODBC-defined type of any object. -// All the values here are the sames as in the JDBC and ODBC specs. +// All the values here are the same as in the JDBC and ODBC specs. 
type XdbcDataType int32 const ( @@ -1965,7 +1965,7 @@ const ( // Indicates that the fields allow the use of null values. Nullable_NULLABILITY_NULLABLE Nullable = 1 //* - // Indicates that nullability of the fields can not be determined. + // Indicates that nullability of the fields cannot be determined. Nullable_NULLABILITY_UNKNOWN Nullable = 2 ) @@ -2014,7 +2014,7 @@ type Searchable int32 const ( //* - // Indicates that column can not be used in a WHERE clause. + // Indicates that column cannot be used in a WHERE clause. Searchable_SEARCHABLE_NONE Searchable = 0 //* // Indicates that the column can be used in a WHERE clause if it is using a @@ -2633,7 +2633,7 @@ func (x *CommandGetDbSchemas) GetDbSchemaFilterPattern() string { // - ARROW:FLIGHT:SQL:PRECISION - Column precision/size // - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable // - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. // The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. @@ -3881,7 +3881,7 @@ func (x *ActionEndSavepointRequest) GetAction() ActionEndSavepointRequest_EndSav // - ARROW:FLIGHT:SQL:PRECISION - Column precision/size // - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable // - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. // - GetFlightInfo: execute the query. @@ -3954,7 +3954,7 @@ func (x *CommandStatementQuery) GetTransactionId() []byte { // - ARROW:FLIGHT:SQL:PRECISION - Column precision/size // - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable // - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. // - GetFlightInfo: execute the query. @@ -4079,7 +4079,7 @@ func (x *TicketStatementQuery) GetStatementHandle() []byte { // - ARROW:FLIGHT:SQL:PRECISION - Column precision/size // - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable // - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. 
+// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. // - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. // - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. @@ -4134,7 +4134,7 @@ func (x *CommandPreparedStatementQuery) GetPreparedStatementHandle() []byte { // // Represents a SQL update query. Used in the command member of FlightDescriptor -// for the the RPC call DoPut to cause the server to execute the included SQL update. +// for the RPC call DoPut to cause the server to execute the included SQL update. type CommandStatementUpdate struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -4194,7 +4194,7 @@ func (x *CommandStatementUpdate) GetTransactionId() []byte { // // Represents a SQL update query. Used in the command member of FlightDescriptor -// for the the RPC call DoPut to cause the server to execute the included +// for the RPC call DoPut to cause the server to execute the included // prepared statement handle as an update. type CommandPreparedStatementUpdate struct { state protoimpl.MessageState diff --git a/go/arrow/flight/server.go b/go/arrow/flight/server.go index 8676b15644e47..c9c8b390a86d8 100644 --- a/go/arrow/flight/server.go +++ b/go/arrow/flight/server.go @@ -165,7 +165,7 @@ func (s *BaseFlightServer) Handshake(stream flight.FlightService_HandshakeServer } // CustomerServerMiddleware is a helper interface for more easily defining custom -// grpc middlware without having to expose or understand all the grpc bells and whistles. +// grpc middleware without having to expose or understand all the grpc bells and whistles. type CustomServerMiddleware interface { // StartCall will be called with the current context of the call, grpc.SetHeader can be used to add outgoing headers // if the returned context is non-nil, then it will be used as the new context being passed through the calls @@ -224,7 +224,7 @@ type server struct { // the utility of the helpers // // Deprecated: prefer to use NewServerWithMiddleware, due to auth handler middleware -// this function will be problematic if any of the grpc options specify other middlewares. +// this function will be problematic if any of the grpc options specify other middleware. 
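//
// Illustrative sketch (not part of the patch): the preferred construction path
// mentioned above. It assumes the NewServerWithMiddleware and
// CreateServerMiddleware helpers in this package; treat both names as
// assumptions of this note, not facts established by the diff.
//
//	// mw is any value implementing CustomServerMiddleware (hypothetical).
//	srv := NewServerWithMiddleware([]ServerMiddleware{CreateServerMiddleware(mw)})
//	_ = srv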
func NewFlightServer(opt ...grpc.ServerOption) Server { opt = append([]grpc.ServerOption{ grpc.ChainStreamInterceptor(serverAuthStreamInterceptor), diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index f74b615362642..84dc638983298 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -1181,7 +1181,7 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr nulls := arr.Count - bitutil.CountSetBits(bitmap.Bytes(), 0, arr.Count) - fields := make([]arrow.ArrayData, len(dt.Fields())) + fields := make([]arrow.ArrayData, dt.NumFields()) for i := range fields { child := arrayFromJSON(mem, dt.Field(i).Type, arr.Children[i]) defer child.Release() @@ -1328,7 +1328,7 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr return array.NewData(dt, arr.Count, []*memory.Buffer{nil}, []arrow.ArrayData{runEnds, values}, 0, 0) case arrow.UnionType: - fields := make([]arrow.ArrayData, len(dt.Fields())) + fields := make([]arrow.ArrayData, dt.NumFields()) for i, f := range dt.Fields() { child := arrayFromJSON(mem, f.Type, arr.Children[i]) defer child.Release() @@ -1620,7 +1620,7 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { Name: field.Name, Count: arr.Len(), Valids: validsToJSON(arr), - Children: make([]Array, len(dt.Fields())), + Children: make([]Array, dt.NumFields()), } for i := range o.Children { o.Children[i] = arrayToJSON(dt.Field(i), arr.Field(i)) @@ -1741,7 +1741,7 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { Count: arr.Len(), Valids: validsToJSON(arr), TypeID: arr.RawTypeCodes(), - Children: make([]Array, len(dt.Fields())), + Children: make([]Array, dt.NumFields()), } if dt.Mode() == arrow.DenseMode { o.Offset = arr.(*array.DenseUnion).RawValueOffsets() diff --git a/go/arrow/internal/arrjson/writer.go b/go/arrow/internal/arrjson/writer.go index af7032d581f4d..fdd36e97d9b42 100644 --- a/go/arrow/internal/arrjson/writer.go +++ b/go/arrow/internal/arrjson/writer.go @@ -90,7 +90,7 @@ func (w *Writer) Close() error { enc := json.NewEncoder(w.w) enc.SetIndent("", jsonIndent) // ensure that we don't convert <, >, !, etc. to their unicode equivalents - // in the output json since we're not using this in an HTML context so that + // in the output json since we aren't using this in an HTML context so that // we can make sure that the json files match. enc.SetEscapeHTML(false) return enc.Encode(w.raw) diff --git a/go/arrow/internal/dictutils/dict.go b/go/arrow/internal/dictutils/dict.go index d31369f7d25b3..5c0bf54dafcbd 100644 --- a/go/arrow/internal/dictutils/dict.go +++ b/go/arrow/internal/dictutils/dict.go @@ -104,7 +104,7 @@ func (d *Mapper) InsertPath(pos FieldPos) { d.hasher.Reset() } -func (d *Mapper) ImportField(pos FieldPos, field *arrow.Field) { +func (d *Mapper) ImportField(pos FieldPos, field arrow.Field) { dt := field.Type if dt.ID() == arrow.EXTENSION { dt = dt.(arrow.ExtensionType).StorageType() @@ -126,13 +126,18 @@ func (d *Mapper) ImportField(pos FieldPos, field *arrow.Field) { func (d *Mapper) ImportFields(pos FieldPos, fields []arrow.Field) { for i := range fields { - d.ImportField(pos.Child(int32(i)), &fields[i]) + d.ImportField(pos.Child(int32(i)), fields[i]) } } func (d *Mapper) ImportSchema(schema *arrow.Schema) { d.pathToID = make(map[uint64]int64) - d.ImportFields(NewFieldPos(), schema.Fields()) + // This code path intentionally avoids calling ImportFields with + // schema.Fields to avoid allocations. 
+ pos := NewFieldPos() + for i := 0; i < schema.NumFields(); i++ { + d.ImportField(pos.Child(int32(i)), schema.Field(i)) + } } func hasUnresolvedNestedDict(data arrow.ArrayData) bool { diff --git a/go/arrow/internal/flatbuf/Timestamp.go b/go/arrow/internal/flatbuf/Timestamp.go index f53211455c06a..d0058e13e6545 100644 --- a/go/arrow/internal/flatbuf/Timestamp.go +++ b/go/arrow/internal/flatbuf/Timestamp.go @@ -121,7 +121,7 @@ import ( /// no indication of how to map this information to a physical point in time. /// Naive date-times must be handled with care because of this missing /// information, and also because daylight saving time (DST) may make -/// some values ambiguous or non-existent. A naive date-time may be +/// some values ambiguous or nonexistent. A naive date-time may be /// stored as a struct with Date and Time fields. However, it may also be /// encoded into a Timestamp column with an empty timezone. The timestamp /// values should be computed "as if" the timezone of the date-time values diff --git a/go/arrow/internal/flight_integration/scenario.go b/go/arrow/internal/flight_integration/scenario.go index 3ec905e2d659c..0b12d22cc7ed7 100644 --- a/go/arrow/internal/flight_integration/scenario.go +++ b/go/arrow/internal/flight_integration/scenario.go @@ -1790,7 +1790,7 @@ func (m *flightSqlScenarioTester) GetSchemaPreparedStatement(ctx context.Context case "SELECT PREPARED STATEMENT WITH TXN HANDLE", "PLAN WITH TXN HANDLE": return &flight.SchemaResult{Schema: flight.SerializeSchema(getQueryWithTransactionSchema(), memory.DefaultAllocator)}, nil } - return nil, fmt.Errorf("%w: invalid handle for GetSchemaPreparedStaement %s", + return nil, fmt.Errorf("%w: invalid handle for GetSchemaPreparedStatement %s", arrow.ErrInvalid, string(cmd.GetPreparedStatementHandle())) } diff --git a/go/arrow/internal/testing/gen/random_array_gen.go b/go/arrow/internal/testing/gen/random_array_gen.go index 57b417bd2b878..3e5a11fe88d2d 100644 --- a/go/arrow/internal/testing/gen/random_array_gen.go +++ b/go/arrow/internal/testing/gen/random_array_gen.go @@ -52,7 +52,7 @@ func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float count := int64(0) r.extra++ - // bernoulli distribution uses P to determine the probabitiliy of a 0 or a 1, + // bernoulli distribution uses P to determine the probability of a 0 or a 1, // which we'll use to generate the bitmap. 
dist := distuv.Bernoulli{P: 1 - prob, Src: rand.NewSource(r.seed + r.extra)} for i := 0; int64(i) < n; i++ { diff --git a/go/arrow/ipc/compression.go b/go/arrow/ipc/compression.go index 8856b732f9c5d..06a9cf67cfb6b 100644 --- a/go/arrow/ipc/compression.go +++ b/go/arrow/ipc/compression.go @@ -104,7 +104,9 @@ type lz4Decompressor struct { *lz4.Reader } -func (z *lz4Decompressor) Close() {} +func (z *lz4Decompressor) Close() { + z.Reader.Reset(nil) +} func getDecompressor(codec flatbuf.CompressionType) decompressor { switch codec { diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index 1c7eb31799cfa..dd51a761510d8 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -351,9 +351,9 @@ func newRecord(schema *arrow.Schema, memo *dictutils.Memo, meta *memory.Buffer, } pos := dictutils.NewFieldPos() - cols := make([]arrow.Array, len(schema.Fields())) - for i, field := range schema.Fields() { - data := ctx.loadArray(field.Type) + cols := make([]arrow.Array, schema.NumFields()) + for i := 0; i < schema.NumFields(); i++ { + data := ctx.loadArray(schema.Field(i).Type) defer data.Release() if err := dictutils.ResolveFieldDict(memo, data, pos.Child(int32(i)), mem); err != nil { @@ -663,7 +663,7 @@ func (ctx *arrayLoaderContext) loadStruct(dt *arrow.StructType) arrow.ArrayData field, buffers := ctx.loadCommon(dt.ID(), 1) defer releaseBuffers(buffers) - subs := make([]arrow.ArrayData, len(dt.Fields())) + subs := make([]arrow.ArrayData, dt.NumFields()) for i, f := range dt.Fields() { subs[i] = ctx.loadChild(f.Type) } @@ -705,7 +705,7 @@ func (ctx *arrayLoaderContext) loadUnion(dt arrow.UnionType) arrow.ArrayData { } defer releaseBuffers(buffers) - subs := make([]arrow.ArrayData, len(dt.Fields())) + subs := make([]arrow.ArrayData, dt.NumFields()) for i, f := range dt.Fields() { subs[i] = ctx.loadChild(f.Type) } @@ -730,6 +730,7 @@ func readDictionary(memo *dictutils.Memo, meta *memory.Buffer, body ReadAtSeeker bodyCompress := data.Compression(nil) if bodyCompress != nil { codec = getDecompressor(bodyCompress.Codec()) + defer codec.Close() } id := md.Id() diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index 54ef58753a173..7a7f9b3e212b7 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -373,7 +373,7 @@ func (fv *fieldVisitor) visit(field arrow.Field) { case *arrow.StructType: fv.dtype = flatbuf.TypeStruct_ - offsets := make([]flatbuffers.UOffsetT, len(dt.Fields())) + offsets := make([]flatbuffers.UOffsetT, dt.NumFields()) for i, field := range dt.Fields() { offsets[i] = fieldToFB(fv.b, fv.pos.Child(int32(i)), field, fv.memo) } @@ -472,7 +472,7 @@ func (fv *fieldVisitor) visit(field arrow.Field) { case arrow.UnionType: fv.dtype = flatbuf.TypeUnion - offsets := make([]flatbuffers.UOffsetT, len(dt.Fields())) + offsets := make([]flatbuffers.UOffsetT, dt.NumFields()) for i, field := range dt.Fields() { offsets[i] = fieldToFB(fv.b, fv.pos.Child(int32(i)), field, fv.memo) } @@ -1100,10 +1100,10 @@ func schemaFromFB(schema *flatbuf.Schema, memo *dictutils.Memo) (*arrow.Schema, } func schemaToFB(b *flatbuffers.Builder, schema *arrow.Schema, memo *dictutils.Mapper) flatbuffers.UOffsetT { - fields := make([]flatbuffers.UOffsetT, len(schema.Fields())) + fields := make([]flatbuffers.UOffsetT, schema.NumFields()) pos := dictutils.NewFieldPos() - for i, field := range schema.Fields() { - fields[i] = fieldToFB(b, pos.Child(int32(i)), field, memo) + for i := 0; i < schema.NumFields(); i++ { + fields[i] = fieldToFB(b, pos.Child(int32(i)), 
schema.Field(i), memo) } flatbuf.SchemaStartFieldsVector(b, len(fields)) diff --git a/go/arrow/ipc/reader_test.go b/go/arrow/ipc/reader_test.go index f00f3bb3da476..42bb3fea3e963 100644 --- a/go/arrow/ipc/reader_test.go +++ b/go/arrow/ipc/reader_test.go @@ -18,6 +18,8 @@ package ipc import ( "bytes" + "fmt" + "io" "testing" "github.com/apache/arrow/go/v15/arrow" @@ -93,3 +95,91 @@ func TestReaderCheckedAllocator(t *testing.T) { _, err = reader.Read() require.NoError(t, err) } + +func BenchmarkIPC(b *testing.B) { + alloc := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer alloc.AssertSize(b, 0) + + schema := arrow.NewSchema([]arrow.Field{ + { + Name: "s", + Type: &arrow.DictionaryType{ + ValueType: arrow.BinaryTypes.String, + IndexType: arrow.PrimitiveTypes.Int32, + }, + }, + }, nil) + + rb := array.NewRecordBuilder(alloc, schema) + defer rb.Release() + + bldr := rb.Field(0).(*array.BinaryDictionaryBuilder) + bldr.Append([]byte("foo")) + bldr.Append([]byte("bar")) + bldr.Append([]byte("baz")) + + rec := rb.NewRecord() + defer rec.Release() + + for _, codec := range []struct { + name string + codecOption Option + }{ + { + name: "plain", + }, + { + name: "zstd", + codecOption: WithZstd(), + }, + { + name: "lz4", + codecOption: WithLZ4(), + }, + } { + options := []Option{WithSchema(schema), WithAllocator(alloc)} + if codec.codecOption != nil { + options = append(options, codec.codecOption) + } + b.Run(fmt.Sprintf("Writer/codec=%s", codec.name), func(b *testing.B) { + buf := new(bytes.Buffer) + for i := 0; i < b.N; i++ { + func() { + buf.Reset() + writer := NewWriter(buf, options...) + defer writer.Close() + if err := writer.Write(rec); err != nil { + b.Fatal(err) + } + }() + } + }) + + b.Run(fmt.Sprintf("Reader/codec=%s", codec.name), func(b *testing.B) { + buf := new(bytes.Buffer) + writer := NewWriter(buf, options...) + defer writer.Close() + require.NoError(b, writer.Write(rec)) + bufBytes := buf.Bytes() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + func() { + reader, err := NewReader(bytes.NewReader(bufBytes), WithAllocator(alloc)) + if err != nil { + b.Fatal(err) + } + defer reader.Release() + for { + if _, err := reader.Read(); err != nil { + if err == io.EOF { + break + } + b.Fatal(err) + } + } + }() + } + }) + } +} diff --git a/go/arrow/memory/mallocator/mallocator.go b/go/arrow/memory/mallocator/mallocator.go index 18e0377c4fb6a..a111f009ec52d 100644 --- a/go/arrow/memory/mallocator/mallocator.go +++ b/go/arrow/memory/mallocator/mallocator.go @@ -37,7 +37,7 @@ import ( // Mallocator is an allocator which defers to libc malloc. // -// The priamry reason to use this is when exporting data across the C Data +// The primary reason to use this is when exporting data across the C Data // Interface. CGO requires that pointers to Go memory are not stored in C // memory, which is exactly what the C Data Interface would otherwise // require. 
By allocating with Mallocator up front, we can safely export the diff --git a/go/arrow/scalar/nested.go b/go/arrow/scalar/nested.go index 8250beb5ed90d..cf89dc9fbdc17 100644 --- a/go/arrow/scalar/nested.go +++ b/go/arrow/scalar/nested.go @@ -132,7 +132,7 @@ func NewLargeListScalarData(val arrow.ArrayData) *LargeList { } func makeMapType(typ *arrow.StructType) *arrow.MapType { - debug.Assert(len(typ.Fields()) == 2, "must pass struct with only 2 fields for MapScalar") + debug.Assert(typ.NumFields() == 2, "must pass struct with only 2 fields for MapScalar") return arrow.MapOf(typ.Field(0).Type, typ.Field(1).Type) } @@ -265,7 +265,7 @@ func (s *Struct) Validate() (err error) { } st := s.Type.(*arrow.StructType) - num := len(st.Fields()) + num := st.NumFields() if len(s.Value) != num { return fmt.Errorf("non-null %s scalar should have %d child values, got %d", s.Type, num, len(s.Value)) } @@ -303,7 +303,7 @@ func (s *Struct) ValidateFull() (err error) { } st := s.Type.(*arrow.StructType) - num := len(st.Fields()) + num := st.NumFields() if len(s.Value) != num { return fmt.Errorf("non-null %s scalar should have %d child values, got %d", s.Type, num, len(s.Value)) } @@ -571,8 +571,8 @@ func (s *SparseUnion) Release() { func (s *SparseUnion) Validate() (err error) { dt := s.Type.(*arrow.SparseUnionType) - if len(dt.Fields()) != len(s.Value) { - return fmt.Errorf("sparse union scalar value had %d fields but type has %d fields", len(dt.Fields()), len(s.Value)) + if dt.NumFields() != len(s.Value) { + return fmt.Errorf("sparse union scalar value had %d fields but type has %d fields", dt.NumFields(), len(s.Value)) } if s.TypeCode < 0 || int(s.TypeCode) >= len(dt.ChildIDs()) || dt.ChildIDs()[s.TypeCode] == arrow.InvalidUnionChildID { @@ -593,8 +593,8 @@ func (s *SparseUnion) Validate() (err error) { func (s *SparseUnion) ValidateFull() (err error) { dt := s.Type.(*arrow.SparseUnionType) - if len(dt.Fields()) != len(s.Value) { - return fmt.Errorf("sparse union scalar value had %d fields but type has %d fields", len(dt.Fields()), len(s.Value)) + if dt.NumFields() != len(s.Value) { + return fmt.Errorf("sparse union scalar value had %d fields but type has %d fields", dt.NumFields(), len(s.Value)) } if s.TypeCode < 0 || int(s.TypeCode) >= len(dt.ChildIDs()) || dt.ChildIDs()[s.TypeCode] == arrow.InvalidUnionChildID { @@ -643,7 +643,7 @@ func NewSparseUnionScalar(val []Scalar, code arrow.UnionTypeCode, dt *arrow.Spar func NewSparseUnionScalarFromValue(val Scalar, idx int, dt *arrow.SparseUnionType) *SparseUnion { code := dt.TypeCodes()[idx] - values := make([]Scalar, len(dt.Fields())) + values := make([]Scalar, dt.NumFields()) for i, f := range dt.Fields() { if i == idx { values[i] = val diff --git a/go/arrow/scalar/scalar.go b/go/arrow/scalar/scalar.go index 8b0d3ace2ad78..9744c07fb05a3 100644 --- a/go/arrow/scalar/scalar.go +++ b/go/arrow/scalar/scalar.go @@ -512,7 +512,7 @@ func init() { arrow.LIST: func(dt arrow.DataType) Scalar { return &List{scalar: scalar{dt, false}} }, arrow.STRUCT: func(dt arrow.DataType) Scalar { typ := dt.(*arrow.StructType) - values := make([]Scalar, len(typ.Fields())) + values := make([]Scalar, typ.NumFields()) for i, f := range typ.Fields() { values[i] = MakeNullScalar(f.Type) } @@ -520,10 +520,10 @@ func init() { }, arrow.SPARSE_UNION: func(dt arrow.DataType) Scalar { typ := dt.(*arrow.SparseUnionType) - if len(typ.Fields()) == 0 { + if typ.NumFields() == 0 { panic("cannot make scalar of empty union type") } - values := make([]Scalar, len(typ.Fields())) + values := 
make([]Scalar, typ.NumFields()) for i, f := range typ.Fields() { values[i] = MakeNullScalar(f.Type) } @@ -531,7 +531,7 @@ func init() { }, arrow.DENSE_UNION: func(dt arrow.DataType) Scalar { typ := dt.(*arrow.DenseUnionType) - if len(typ.Fields()) == 0 { + if typ.NumFields() == 0 { panic("cannot make scalar of empty union type") } return NewDenseUnionScalar(MakeNullScalar(typ.Fields()[0].Type), typ.TypeCodes()[0], typ) diff --git a/go/arrow/scalar/scalar_test.go b/go/arrow/scalar/scalar_test.go index ce8170301b0a2..e85f160624d18 100644 --- a/go/arrow/scalar/scalar_test.go +++ b/go/arrow/scalar/scalar_test.go @@ -1194,7 +1194,7 @@ func makeDenseUnionScalar(ty *arrow.DenseUnionType, val scalar.Scalar, idx int) func makeSpecificNullScalar(dt arrow.UnionType, idx int) scalar.Scalar { switch dt.Mode() { case arrow.SparseMode: - values := make([]scalar.Scalar, len(dt.Fields())) + values := make([]scalar.Scalar, dt.NumFields()) for i, f := range dt.Fields() { values[i] = scalar.MakeNullScalar(f.Type) } diff --git a/go/arrow/schema.go b/go/arrow/schema.go index e84f350a53637..7a05bb1888972 100644 --- a/go/arrow/schema.go +++ b/go/arrow/schema.go @@ -259,8 +259,8 @@ func (s *Schema) AddField(i int, field Field) (*Schema, error) { func (s *Schema) String() string { o := new(strings.Builder) - fmt.Fprintf(o, "schema:\n fields: %d\n", len(s.Fields())) - for i, f := range s.Fields() { + fmt.Fprintf(o, "schema:\n fields: %d\n", s.NumFields()) + for i, f := range s.fields { if i > 0 { o.WriteString("\n") } @@ -282,7 +282,7 @@ func (s *Schema) Fingerprint() string { var b strings.Builder b.WriteString("S{") - for _, f := range s.Fields() { + for _, f := range s.fields { fieldFingerprint := f.Fingerprint() if fieldFingerprint == "" { return "" diff --git a/go/arrow/schema_test.go b/go/arrow/schema_test.go index fddf1d7f131ec..fd94620aee650 100644 --- a/go/arrow/schema_test.go +++ b/go/arrow/schema_test.go @@ -255,7 +255,7 @@ func TestSchema(t *testing.T) { s = s.WithEndianness(endian.NonNativeEndian) } - if got, want := len(s.Fields()), len(tc.fields); got != want { + if got, want := s.NumFields(), len(tc.fields); got != want { t.Fatalf("invalid number of fields. got=%d, want=%d", got, want) } @@ -339,7 +339,7 @@ func TestSchemaAddField(t *testing.T) { if err != nil { t.Fatalf("unexpected error: %v", err) } - if got, want := len(s.Fields()), 3; got != want { + if got, want := s.NumFields(), 3; got != want { t.Fatalf("invalid number of fields. 
got=%d, want=%d", got, want) } got, want := s.Field(2), Field{Name: "f3", Type: PrimitiveTypes.Int32} @@ -476,5 +476,5 @@ func TestSchemaNumFields(t *testing.T) { assert.NoError(t, err) assert.Equal(t, 3, s.NumFields()) - assert.Equal(t, s.NumFields(), len(s.Fields())) + assert.Equal(t, s.NumFields(), s.NumFields()) } diff --git a/go/go.mod b/go/go.mod index a6c2af7025d32..73a1cb7e7738b 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,9 @@ require ( require ( github.com/google/uuid v1.3.1 + github.com/hamba/avro/v2 v2.17.2 github.com/substrait-io/substrait-go v0.4.2 + github.com/tidwall/sjson v1.2.5 ) require ( @@ -57,14 +59,21 @@ require ( github.com/fatih/color v1.15.0 // indirect github.com/goccy/go-yaml v1.11.0 // indirect github.com/golang/protobuf v1.5.3 // indirect + github.com/json-iterator/go v1.1.12 // indirect github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 // indirect github.com/kr/text v0.2.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.19 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rogpeppe/go-internal v1.9.0 // indirect github.com/stretchr/objx v0.5.0 // indirect + github.com/tidwall/gjson v1.14.2 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.0 // indirect golang.org/x/mod v0.13.0 // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/text v0.13.0 // indirect diff --git a/go/go.sum b/go/go.sum index bdd499c3f5190..2c1edd59e03a3 100644 --- a/go/go.sum +++ b/go/go.sum @@ -34,10 +34,15 @@ github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8i github.com/google/flatbuffers v23.5.26+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hamba/avro/v2 v2.17.2 h1:6PKpEWzJfNnvBgn7m2/8WYaDOUASxfDU+Jyb4ojDgFY= +github.com/hamba/avro/v2 v2.17.2/go.mod h1:Q9YK+qxAhtVrNqOhwlZTATLgLA8qxG2vtvkhK8fJ7Jo= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= @@ -60,6 +65,13 @@ github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpsp github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= 
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -73,12 +85,21 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/substrait-io/substrait-go v0.4.2 h1:buDnjsb3qAqTaNbOR7VKmNgXf4lYQxWEcnSGUWBtmN8= github.com/substrait-io/substrait-go v0.4.2/go.mod h1:qhpnLmrcvAnlZsUyPXZRqldiHapPTXC3t7xFgDi3aQg= +github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= diff --git a/go/internal/bitutils/bit_block_counter.go b/go/internal/bitutils/bit_block_counter.go index 99eece34cd068..50996b10e8851 100644 --- a/go/internal/bitutils/bit_block_counter.go +++ b/go/internal/bitutils/bit_block_counter.go @@ -165,7 +165,7 @@ func (b *BitBlockCounter) NextWord() BitBlockCount { } // OptionalBitBlockCounter is a useful counter to iterate through a possibly -// non-existent validity bitmap to allow us to 
write one code path for both +// nonexistent validity bitmap to allow us to write one code path for both // the with-nulls and no-nulls cases without giving up a lot of performance. type OptionalBitBlockCounter struct { hasBitmap bool diff --git a/go/internal/hashing/xxh3_memo_table.go b/go/internal/hashing/xxh3_memo_table.go index 81994f0a88541..283bc1a953f05 100644 --- a/go/internal/hashing/xxh3_memo_table.go +++ b/go/internal/hashing/xxh3_memo_table.go @@ -57,7 +57,7 @@ type MemoTable interface { // and a boolean indicating whether or not the value was found in // the table (if false, the value was inserted). An error is returned // if val is not the appropriate type for the table. This function is intended to be used by - // the BinaryMemoTable to prevent uncessary allocations of the data when converting from a []byte to interface{}. + // the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}. GetOrInsertBytes(val []byte) (idx int, existed bool, err error) // GetOrInsertNull returns the index of the null value in the table, // inserting one if it hasn't already been inserted. It returns a boolean @@ -67,7 +67,7 @@ type MemoTable interface { // insert one if it doesn't already exist. Will return -1 if it doesn't exist // indicated by a false value for the boolean. GetNull() (idx int, exists bool) - // WriteOut copys the unique values of the memotable out to the byte slice + // WriteOut copies the unique values of the memotable out to the byte slice // provided. Must have allocated enough bytes for all the values. WriteOut(out []byte) // WriteOutSubset is like WriteOut, but only writes a subset of values diff --git a/go/parquet/encryption_properties.go b/go/parquet/encryption_properties.go index 6eb6cf1fe5680..0eadc5fb0451c 100644 --- a/go/parquet/encryption_properties.go +++ b/go/parquet/encryption_properties.go @@ -104,7 +104,7 @@ type colEncryptConfig struct { encrypted bool } -// ColumnEncryptOption how to specify options to the the NewColumnEncryptionProperties function. +// ColumnEncryptOption how to specify options to the NewColumnEncryptionProperties function. type ColumnEncryptOption func(*colEncryptConfig) // WithKey sets a column specific key. @@ -350,7 +350,7 @@ type FileDecryptionOption func(*fileDecryptConfig) // metadata the metadata will be ignored, the footer will be decrypted/verified with this key. // // If the explicit key is not set, footer key will be fetched from the key retriever. -// With explcit keys or AAD prefix, new encryption properties object must be created for each +// With explicit keys or AAD prefix, new encryption properties object must be created for each // encrypted file. // // Explicit encryption keys (footer and column) are cloned. @@ -379,7 +379,7 @@ func WithPrefixVerifier(verifier AADPrefixVerifier) FileDecryptionOption { // It's also possible to set a key retriever on this property object. // // Upon file decryption, availability of explicit keys is checked before invocation -// of the retreiver callback. +// of the retriever callback. // // If an explicit key is available for a footer or a column, its key metadata will be ignored. 
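(Illustrative aside, not part of the patch.) The comments above describe how explicit column keys interact with the key retriever. Below is a minimal sketch of passing column keys through the option declared just after this note, assuming the module path github.com/apache/arrow/go/v15/parquet and using only identifiers visible in this hunk (WithColumnKeys, ColumnPathToDecryptionPropsMap, FileDecryptionOption); building the map's per-column properties is outside this diff and is deliberately left empty here.

package main

import "github.com/apache/arrow/go/v15/parquet"

func main() {
	// colKeys maps column paths to their decryption properties; the
	// constructors needed to populate it are not shown in this hunk,
	// so the map stays empty in this sketch.
	var colKeys parquet.ColumnPathToDecryptionPropsMap
	// WithColumnKeys wraps the map into a FileDecryptionOption that can
	// be combined with other options when building decryption properties.
	opts := []parquet.FileDecryptionOption{parquet.WithColumnKeys(colKeys)}
	_ = opts
}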
func WithColumnKeys(decrypt ColumnPathToDecryptionPropsMap) FileDecryptionOption { diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index 134c290fa6894..05c1cadebf6cf 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -304,7 +304,7 @@ func (p *PrimitiveWriterTestSuite) writeRequiredWithSettings(encoding parquet.En } writer := p.buildWriter(nrows, columnProperties, parquet.WithVersion(parquet.V1_0)) p.WriteBatchValues(writer, nil, nil) - // behavior should be independant of the number of calls to Close + // behavior should be independent of the number of calls to Close writer.Close() writer.Close() } @@ -321,7 +321,7 @@ func (p *PrimitiveWriterTestSuite) writeRequiredWithSettingsSpaced(encoding parq } writer := p.buildWriter(nrows, columnProperties, parquet.WithVersion(parquet.V1_0)) p.WriteBatchValuesSpaced(writer, nil, nil, validBits, 0) - // behavior should be independant from the number of close calls + // behavior should be independent from the number of close calls writer.Close() writer.Close() } diff --git a/go/parquet/file/file_reader.go b/go/parquet/file/file_reader.go index afff579ded5b7..29162e4a4ec27 100644 --- a/go/parquet/file/file_reader.go +++ b/go/parquet/file/file_reader.go @@ -237,7 +237,7 @@ func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, aadPrefixInFile := algo.Aad.AadPrefix if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" { - return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not suppliedin decryption props") + return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not supplied in decryption props") } if fileHasAadPrefix { diff --git a/go/parquet/file/level_conversion.go b/go/parquet/file/level_conversion.go index c23bdda445963..f6707fce86d80 100755 --- a/go/parquet/file/level_conversion.go +++ b/go/parquet/file/level_conversion.go @@ -94,7 +94,7 @@ func (l *LevelInfo) IncrementRepeated() int16 { l.RepLevel++ l.DefLevel++ - // For levels >= repeated_ancenstor_def_level it indicates the list was + // For levels >= repeated_ancestor_def_level it indicates the list was // non-null and had at least one element. This is important // for later decoding because we need to add a slot for these // values. for levels < current_def_level no slots are added diff --git a/go/parquet/file/level_conversion_test.go b/go/parquet/file/level_conversion_test.go index 5d5bdde90dc7e..54e52c5e7abb6 100644 --- a/go/parquet/file/level_conversion_test.go +++ b/go/parquet/file/level_conversion_test.go @@ -66,7 +66,7 @@ func TestDefLevelsToBitmap(t *testing.T) { assert.Equal(t, curByte, validBits[1]) } -func TestDefLevelstToBitmapPowerOf2(t *testing.T) { +func TestDefLevelsToBitmapPowerOf2(t *testing.T) { defLevels := []int16{3, 3, 3, 2, 3, 3, 3, 3} validBits := []byte{1, 0} diff --git a/go/parquet/file/record_reader.go b/go/parquet/file/record_reader.go index ad836d29ef483..5698f49d9f2bb 100755 --- a/go/parquet/file/record_reader.go +++ b/go/parquet/file/record_reader.go @@ -486,7 +486,7 @@ func (rr *recordReader) delimitRecords(numRecords int64) (recordsRead, valsToRea // if at record start, we are seeing the start of a record // for the second time, such as after repeated calls to delimitrecords. 
// in this case we must continue until we find another record start - // or exaust the column chunk + // or exhaust the column chunk if !rr.atRecStart { // end of a record, increment count recordsRead++ diff --git a/go/parquet/file/row_group_writer.go b/go/parquet/file/row_group_writer.go index 935c13d13a67f..74f4becb55e08 100644 --- a/go/parquet/file/row_group_writer.go +++ b/go/parquet/file/row_group_writer.go @@ -52,7 +52,7 @@ type SerialRowGroupWriter interface { RowGroupWriter NextColumn() (ColumnChunkWriter, error) // returns the current column being built, if buffered it will equal NumColumns - // if serialized then it will return which column is currenly being written + // if serialized then it will return which column is currently being written CurrentColumn() int } diff --git a/go/parquet/internal/encoding/delta_byte_array.go b/go/parquet/internal/encoding/delta_byte_array.go index b35d022fd5deb..57b0c8a70e5ad 100644 --- a/go/parquet/internal/encoding/delta_byte_array.go +++ b/go/parquet/internal/encoding/delta_byte_array.go @@ -148,7 +148,7 @@ func (DeltaByteArrayDecoder) Type() parquet.Type { func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem } -// SetData expects the data passed in to be the prefix lengths, followed by the +// SetData expects the passed in data to be the prefix lengths, followed by the // blocks of suffix data in order to initialize the decoder. func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error { prefixLenDec := DeltaBitPackInt32Decoder{ diff --git a/go/parquet/internal/encoding/memo_table.go b/go/parquet/internal/encoding/memo_table.go index 810e8633b886e..a36ad32973c96 100644 --- a/go/parquet/internal/encoding/memo_table.go +++ b/go/parquet/internal/encoding/memo_table.go @@ -100,7 +100,7 @@ type BinaryMemoTable interface { Retain() // Release decreases the reference count by 1 of the separately stored binary data // kept alongside the table containing the values. When the reference count goes to - // 0, the memory is freed. This is safe to call across multiple goroutines simultaneoulsy. + // 0, the memory is freed. This is safe to call across multiple goroutines simultaneously. Release() } diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go b/go/parquet/internal/encoding/typed_encoder.gen.go index 25fa309e0a38f..4bc18e8c63c01 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go +++ b/go/parquet/internal/encoding/typed_encoder.gen.go @@ -192,7 +192,7 @@ func (DictInt32Decoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictInt32Decoder) Decode(out []int32) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) @@ -429,7 +429,7 @@ func (DictInt64Decoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. 
func (d *DictInt64Decoder) Decode(out []int64) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) @@ -644,7 +644,7 @@ func (DictInt96Decoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictInt96Decoder) Decode(out []parquet.Int96) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) @@ -869,7 +869,7 @@ func (DictFloat32Decoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictFloat32Decoder) Decode(out []float32) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) @@ -1094,7 +1094,7 @@ func (DictFloat64Decoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictFloat64Decoder) Decode(out []float64) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) @@ -1362,7 +1362,7 @@ func (DictByteArrayDecoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) @@ -1541,7 +1541,7 @@ func (DictFixedLenByteArrayDecoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. func (d *DictFixedLenByteArrayDecoder) Decode(out []parquet.FixedLenByteArray) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) diff --git a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl index 66c2649d599c6..d72f31512047a 100644 --- a/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl +++ b/go/parquet/internal/encoding/typed_encoder.gen.go.tmpl @@ -268,7 +268,7 @@ func (Dict{{.Name}}Decoder) Type() parquet.Type { } // Decode populates the passed in slice with min(len(out), remaining values) values, -// decoding using hte dictionary to get the actual values. Returns the number of values +// decoding using the dictionary to get the actual values. Returns the number of values // actually decoded and any error encountered. 
func (d *Dict{{.Name}}Decoder) Decode(out []{{.name}}) (int, error) { vals := shared_utils.MinInt(len(out), d.nvals) diff --git a/go/parquet/internal/testutils/primitive_typed.go b/go/parquet/internal/testutils/primitive_typed.go index daab0d427ba10..50627b2e275ff 100644 --- a/go/parquet/internal/testutils/primitive_typed.go +++ b/go/parquet/internal/testutils/primitive_typed.go @@ -116,7 +116,7 @@ func (p *PrimitiveTypedTest) UpdateStatsSpaced(stat metadata.TypedStatistics, nu case *metadata.FixedLenByteArrayStatistics: s.UpdateSpaced(p.Values.([]parquet.FixedLenByteArray), validBits, validBitsOffset, numNull) default: - panic("uninplemented") + panic("unimplemented") } } diff --git a/go/parquet/internal/testutils/random.go b/go/parquet/internal/testutils/random.go index bb9ee0cdf2bba..4d697693510d8 100644 --- a/go/parquet/internal/testutils/random.go +++ b/go/parquet/internal/testutils/random.go @@ -60,7 +60,7 @@ func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float count := int64(0) r.extra++ - // bernoulli distribution uses P to determine the probabitiliy of a 0 or a 1, + // bernoulli distribution uses P to determine the probability of a 0 or a 1, // which we'll use to generate the bitmap. dist := distuv.Bernoulli{P: prob, Src: rand.NewSource(r.seed + r.extra)} for i := 0; int64(i) < n; i++ { diff --git a/go/parquet/internal/utils/_lib/README.md b/go/parquet/internal/utils/_lib/README.md index 10cc9a257e055..17c3006a5ce08 100644 --- a/go/parquet/internal/utils/_lib/README.md +++ b/go/parquet/internal/utils/_lib/README.md @@ -144,10 +144,10 @@ resulting assembly. function, ends with something akin to `MOVD R0, num+32(FP)`. Where `num` is the local variable name of the return value, and `32` is the byte size of the arguments. -To faciliate some automation, a `script.sed` file is provided in this directory which +To facilitate some automation, a `script.sed` file is provided in this directory which can be run against the generated assembly from `c2goasm` as `sed -f _lib/script.sed -i bit_packing_neon_arm64.s` which will perform several of -these steps on the generated assembly such as convering `b.le`/etc calls with labels +these steps on the generated assembly such as converting `b.le`/etc calls with labels to proper `BLE LBB0_....` lines, and converting `adrp`/`ldr` pairs to `VMOVD` and `VMOVQ` instructions. 
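(Illustrative aside, not part of the patch.) The README above explains how the c2goasm output for the NEON kernels below is post-processed. For readers of those kernels, here is a minimal pure-Go reference for the 1-bit case of what unpack1_32_neon computes: 32 one-bit values pulled out of a single packed uint32 by shifting and masking, which the NEON code does four lanes at a time with vld1q_u32/vandq_u32/vst1q_u32. The function name unpack1x32Scalar is invented for illustration.

package main

import "fmt"

// unpack1x32Scalar is a hypothetical scalar reference: output value i is
// bit i of the packed input word, extracted by a right shift and a mask of
// 0x1, mirroring the shift tables and vandq_u32 mask in the NEON kernel.
func unpack1x32Scalar(in uint32, out []uint32) {
	const mask = 0x1
	for i := 0; i < 32; i++ {
		out[i] = (in >> uint(i)) & mask
	}
}

func main() {
	out := make([]uint32, 32)
	unpack1x32Scalar(0b1011, out)
	fmt.Println(out[:4]) // [1 1 0 1]
}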
diff --git a/go/parquet/internal/utils/_lib/bit_packing_neon.c b/go/parquet/internal/utils/_lib/bit_packing_neon.c index c8dd97c2a36a2..6d09eeb75b6a8 100755 --- a/go/parquet/internal/utils/_lib/bit_packing_neon.c +++ b/go/parquet/internal/utils/_lib/bit_packing_neon.c @@ -38,7 +38,7 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_6th[4] = {20, 21, 22, 23}; uint32_t shifts_7th[4] = {24, 25, 26, 27}; uint32_t shifts_8th[4] = {28, 29, 30, 31}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -48,8 +48,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -58,8 +58,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -68,8 +68,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_3rd[1]; ind[2] = in[0] >> shifts_3rd[2]; ind[3] = in[0] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -78,8 +78,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_4th[1]; ind[2] = in[0] >> shifts_4th[2]; ind[3] = in[0] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -88,8 +88,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_5th[1]; ind[2] = in[0] >> shifts_5th[2]; ind[3] = in[0] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -98,8 +98,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_6th[1]; ind[2] = in[0] >> shifts_6th[2]; ind[3] = in[0] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -108,8 +108,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_7th[1]; ind[2] = in[0] >> shifts_7th[2]; ind[3] = in[0] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -118,8 +118,8 @@ inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_8th[1]; ind[2] = in[0] >> shifts_8th[2]; ind[3] = in[0] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, 
reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -136,7 +136,7 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_3rd[4] = {16, 18, 20, 22}; uint32_t shifts_4th[4] = {24, 26, 28, 30}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -146,8 +146,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -156,8 +156,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -166,8 +166,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_3rd[1]; ind[2] = in[0] >> shifts_3rd[2]; ind[3] = in[0] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -176,8 +176,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_4th[1]; ind[2] = in[0] >> shifts_4th[2]; ind[3] = in[0] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -186,8 +186,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -196,8 +196,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -206,8 +206,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_3rd[1]; ind[2] = in[1] >> shifts_3rd[2]; ind[3] = in[1] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -216,8 +216,8 @@ inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_4th[1]; ind[2] = in[1] >> shifts_4th[2]; ind[3] = in[1] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -237,7 +237,7 @@ inline static const uint32_t* 
unpack3_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_6th[4] = {28, 0, 2, 5}; uint32_t shifts_7th[4] = {8, 11, 14, 17}; uint32_t shifts_8th[4] = {20, 23, 26, 29}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -247,8 +247,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -257,8 +257,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -267,8 +267,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_3rd[1]; ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_3rd[2]; ind[3] = in[1] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -277,8 +277,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_4th[1]; ind[2] = in[1] >> shifts_4th[2]; ind[3] = in[1] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -287,8 +287,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_5th[1]; ind[2] = in[1] >> shifts_5th[2]; ind[3] = in[1] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -297,8 +297,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_6th[1]; ind[2] = in[2] >> shifts_6th[2]; ind[3] = in[2] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -307,8 +307,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_7th[1]; ind[2] = in[2] >> shifts_7th[2]; ind[3] = in[2] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -317,8 +317,8 @@ inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_8th[1]; ind[2] = in[2] >> shifts_8th[2]; ind[3] = in[2] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -332,7 +332,7 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 4, 8, 
12}; uint32_t shifts_2nd[4] = {16, 20, 24, 28}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -342,8 +342,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -352,8 +352,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -362,8 +362,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -372,8 +372,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -382,8 +382,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_1st[1]; ind[2] = in[2] >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -392,8 +392,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -402,8 +402,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -412,8 +412,8 @@ inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_2nd[1]; ind[2] = in[3] >> shifts_2nd[2]; ind[3] = in[3] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -433,7 +433,7 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_6th[4] = {4, 9, 14, 19}; uint32_t shifts_7th[4] = {24, 0, 2, 7}; uint32_t shifts_8th[4] = {12, 17, 22, 27}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; 
reg_masks = vdupq_n_u32(mask); @@ -443,8 +443,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -453,8 +453,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_2nd[1]; ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -463,8 +463,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_3rd[1]; ind[2] = in[1] >> shifts_3rd[2]; ind[3] = in[1] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -473,8 +473,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_4th[1]; ind[2] = in[2] >> shifts_4th[2]; ind[3] = in[2] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -483,8 +483,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_5th[1]; ind[2] = in[2] >> shifts_5th[2]; ind[3] = (in[2] >> 31 | in[3] << 1) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -493,8 +493,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_6th[1]; ind[2] = in[3] >> shifts_6th[2]; ind[3] = in[3] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -503,8 +503,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_7th[1]; ind[2] = in[4] >> shifts_7th[2]; ind[3] = in[4] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -513,8 +513,8 @@ inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[4] >> shifts_8th[1]; ind[2] = in[4] >> shifts_8th[2]; ind[3] = in[4] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -531,7 +531,7 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_3rd[4] = {16, 22, 0, 2}; uint32_t shifts_4th[4] = {8, 14, 20, 26}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -541,8 +541,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, 
uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -551,8 +551,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -561,8 +561,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_3rd[1]; ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_3rd[2]; ind[3] = in[2] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -571,8 +571,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_4th[1]; ind[2] = in[2] >> shifts_4th[2]; ind[3] = in[2] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -581,8 +581,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -591,8 +591,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[1]; ind[2] = in[4] >> shifts_2nd[2]; ind[3] = in[4] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -601,8 +601,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[4] >> shifts_3rd[1]; ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_3rd[2]; ind[3] = in[5] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -611,8 +611,8 @@ inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[5] >> shifts_4th[1]; ind[2] = in[5] >> shifts_4th[2]; ind[3] = in[5] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -632,7 +632,7 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_6th[4] = {12, 19, 0, 1}; uint32_t shifts_7th[4] = {8, 15, 22, 0}; uint32_t shifts_8th[4] = {4, 11, 18, 25}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -642,8 +642,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = 
in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -652,8 +652,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -662,8 +662,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_3rd[1]; ind[2] = in[2] >> shifts_3rd[2]; ind[3] = in[2] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -672,8 +672,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[2] >> 27 | in[3] << 5) >> shifts_4th[1]; ind[2] = in[3] >> shifts_4th[2]; ind[3] = in[3] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -682,8 +682,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_5th[1]; ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_5th[2]; ind[3] = in[4] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -692,8 +692,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[4] >> shifts_6th[1]; ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_6th[2]; ind[3] = in[5] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -702,8 +702,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[5] >> shifts_7th[1]; ind[2] = in[5] >> shifts_7th[2]; ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -712,8 +712,8 @@ inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[6] >> shifts_8th[1]; ind[2] = in[6] >> shifts_8th[2]; ind[3] = in[6] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -726,7 +726,7 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) uint32_t mask = 0xff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 8, 16, 24}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -736,8 +736,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; - reg_shft = 
vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -746,8 +746,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -756,8 +756,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_1st[1]; ind[2] = in[2] >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -766,8 +766,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -776,8 +776,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[4] >> shifts_1st[1]; ind[2] = in[4] >> shifts_1st[2]; ind[3] = in[4] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -786,8 +786,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[5] >> shifts_1st[1]; ind[2] = in[5] >> shifts_1st[2]; ind[3] = in[5] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -796,8 +796,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[6] >> shifts_1st[1]; ind[2] = in[6] >> shifts_1st[2]; ind[3] = in[6] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -806,8 +806,8 @@ inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[7] >> shifts_1st[1]; ind[2] = in[7] >> shifts_1st[2]; ind[3] = in[7] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -827,7 +827,7 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) uint32_t shifts_6th[4] = {20, 0, 6, 15}; uint32_t shifts_7th[4] = {0, 1, 10, 19}; uint32_t shifts_8th[4] = {0, 5, 14, 23}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -837,8 +837,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, 
reg_masks); vst1q_u32(out, results); out += 4; @@ -847,8 +847,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -857,8 +857,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[2] >> shifts_3rd[1]; ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[2]; ind[3] = in[3] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -867,8 +867,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[3] >> shifts_4th[1]; ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_4th[2]; ind[3] = in[4] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -877,8 +877,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_5th[1]; ind[2] = in[5] >> shifts_5th[2]; ind[3] = in[5] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -887,8 +887,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_6th[1]; ind[2] = in[6] >> shifts_6th[2]; ind[3] = in[6] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -897,8 +897,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[7] >> shifts_7th[1]; ind[2] = in[7] >> shifts_7th[2]; ind[3] = in[7] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -907,8 +907,8 @@ inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) ind[1] = in[8] >> shifts_8th[1]; ind[2] = in[8] >> shifts_8th[2]; ind[3] = in[8] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -924,7 +924,7 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_2nd[4] = {8, 18, 0, 6}; uint32_t shifts_3rd[4] = {16, 0, 4, 14}; uint32_t shifts_4th[4] = {0, 2, 12, 22}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -934,8 +934,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); 
vst1q_u32(out, results); out += 4; @@ -944,8 +944,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[1] >> shifts_2nd[1]; ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -954,8 +954,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[1]; ind[2] = in[3] >> shifts_3rd[2]; ind[3] = in[3] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -964,8 +964,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[4] >> shifts_4th[1]; ind[2] = in[4] >> shifts_4th[2]; ind[3] = in[4] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -974,8 +974,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[5] >> shifts_1st[1]; ind[2] = in[5] >> shifts_1st[2]; ind[3] = (in[5] >> 30 | in[6] << 2) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -984,8 +984,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[6] >> shifts_2nd[1]; ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_2nd[2]; ind[3] = in[7] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -994,8 +994,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[1]; ind[2] = in[8] >> shifts_3rd[2]; ind[3] = in[8] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1004,8 +1004,8 @@ inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[9] >> shifts_4th[1]; ind[2] = in[9] >> shifts_4th[2]; ind[3] = in[9] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1025,7 +1025,7 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 7, 18, 0}; uint32_t shifts_7th[4] = {8, 19, 0, 9}; uint32_t shifts_8th[4] = {20, 0, 10, 21}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1035,8 +1035,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, 
results); out += 4; @@ -1045,8 +1045,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[1] >> 23 | in[2] << 9) >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1055,8 +1055,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[3] >> shifts_3rd[1]; ind[2] = in[3] >> shifts_3rd[2]; ind[3] = (in[3] >> 25 | in[4] << 7) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1065,8 +1065,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[4] >> shifts_4th[1]; ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_4th[2]; ind[3] = in[5] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1075,8 +1075,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[5] >> 27 | in[6] << 5) >> shifts_5th[1]; ind[2] = in[6] >> shifts_5th[2]; ind[3] = in[6] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1085,8 +1085,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[7] >> shifts_6th[1]; ind[2] = in[7] >> shifts_6th[2]; ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1095,8 +1095,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[8] >> shifts_7th[1]; ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_7th[2]; ind[3] = in[9] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1105,8 +1105,8 @@ inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[9] >> 31 | in[10] << 1) >> shifts_8th[1]; ind[2] = in[10] >> shifts_8th[2]; ind[3] = in[10] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1120,7 +1120,7 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 12, 0, 4}; uint32_t shifts_2nd[4] = {16, 0, 8, 20}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1130,8 +1130,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, 
reg_masks); vst1q_u32(out, results); out += 4; @@ -1140,8 +1140,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1150,8 +1150,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[3] >> shifts_1st[1]; ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[2]; ind[3] = in[4] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1160,8 +1160,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[1]; ind[2] = in[5] >> shifts_2nd[2]; ind[3] = in[5] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1170,8 +1170,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[6] >> shifts_1st[1]; ind[2] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[2]; ind[3] = in[7] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1180,8 +1180,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_2nd[1]; ind[2] = in[8] >> shifts_2nd[2]; ind[3] = in[8] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1190,8 +1190,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[9] >> shifts_1st[1]; ind[2] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[2]; ind[3] = in[10] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1200,8 +1200,8 @@ inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[10] >> 28 | in[11] << 4) >> shifts_2nd[1]; ind[2] = in[11] >> shifts_2nd[2]; ind[3] = in[11] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1221,7 +1221,7 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {4, 17, 0, 11}; uint32_t shifts_7th[4] = {0, 5, 18, 0}; uint32_t shifts_8th[4] = {12, 0, 6, 19}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1231,8 +1231,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = 
vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1241,8 +1241,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[2] >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = (in[2] >> 27 | in[3] << 5) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1251,8 +1251,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[3] >> 21 | in[4] << 11) >> shifts_3rd[1]; ind[2] = in[4] >> shifts_3rd[2]; ind[3] = in[4] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1261,8 +1261,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[5] >> shifts_4th[1]; ind[2] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[2]; ind[3] = in[6] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1271,8 +1271,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_5th[1]; ind[2] = in[7] >> shifts_5th[2]; ind[3] = (in[7] >> 23 | in[8] << 9) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1281,8 +1281,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[8] >> shifts_6th[1]; ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_6th[2]; ind[3] = in[9] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1291,8 +1291,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[10] >> shifts_7th[1]; ind[2] = in[10] >> shifts_7th[2]; ind[3] = (in[10] >> 31 | in[11] << 1) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1301,8 +1301,8 @@ inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_8th[1]; ind[2] = in[12] >> shifts_8th[2]; ind[3] = in[12] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1318,7 +1318,7 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_2nd[4] = {0, 6, 0, 2}; uint32_t shifts_3rd[4] = {16, 0, 12, 0}; uint32_t shifts_4th[4] = {8, 0, 4, 18}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1328,8 +1328,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = 
vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1338,8 +1338,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[2] >> shifts_2nd[1]; ind[2] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[2]; ind[3] = in[3] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1348,8 +1348,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_3rd[1]; ind[2] = in[4] >> shifts_3rd[2]; ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1358,8 +1358,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[1]; ind[2] = in[6] >> shifts_4th[2]; ind[3] = in[6] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1368,8 +1368,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[7] >> shifts_1st[1]; ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[2]; ind[3] = in[8] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1378,8 +1378,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[9] >> shifts_2nd[1]; ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_2nd[2]; ind[3] = in[10] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1388,8 +1388,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[10] >> 30 | in[11] << 2) >> shifts_3rd[1]; ind[2] = in[11] >> shifts_3rd[2]; ind[3] = (in[11] >> 26 | in[12] << 6) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1398,8 +1398,8 @@ inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[1]; ind[2] = in[13] >> shifts_4th[2]; ind[3] = in[13] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1419,7 +1419,7 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {12, 0, 10, 0}; uint32_t shifts_7th[4] = {8, 0, 6, 0}; uint32_t shifts_8th[4] = {4, 0, 2, 17}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1429,8 +1429,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] 
= (in[0] >> 30 | in[1] << 2) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1439,8 +1439,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[2] >> shifts_2nd[1]; ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[2]; ind[3] = in[3] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1449,8 +1449,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[4] >> shifts_3rd[1]; ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_3rd[2]; ind[3] = in[5] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1459,8 +1459,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[6] >> shifts_4th[1]; ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_4th[2]; ind[3] = in[7] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1469,8 +1469,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 31 | in[8] << 1) >> shifts_5th[1]; ind[2] = in[8] >> shifts_5th[2]; ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1479,8 +1479,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[9] >> 27 | in[10] << 5) >> shifts_6th[1]; ind[2] = in[10] >> shifts_6th[2]; ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1489,8 +1489,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[11] >> 23 | in[12] << 9) >> shifts_7th[1]; ind[2] = in[12] >> shifts_7th[2]; ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1499,8 +1499,8 @@ inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[13] >> 19 | in[14] << 13) >> shifts_8th[1]; ind[2] = in[14] >> shifts_8th[2]; ind[3] = in[14] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1513,7 +1513,7 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out uint32_t mask = 0xffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 16, 0, 16}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1523,8 +1523,8 @@ inline static const uint32_t* 
unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1533,8 +1533,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[2] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1543,8 +1543,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[4] >> shifts_1st[1]; ind[2] = in[5] >> shifts_1st[2]; ind[3] = in[5] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1553,8 +1553,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[6] >> shifts_1st[1]; ind[2] = in[7] >> shifts_1st[2]; ind[3] = in[7] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1563,8 +1563,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[8] >> shifts_1st[1]; ind[2] = in[9] >> shifts_1st[2]; ind[3] = in[9] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1573,8 +1573,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[10] >> shifts_1st[1]; ind[2] = in[11] >> shifts_1st[2]; ind[3] = in[11] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1583,8 +1583,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[12] >> shifts_1st[1]; ind[2] = in[13] >> shifts_1st[2]; ind[3] = in[13] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1593,8 +1593,8 @@ inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[14] >> shifts_1st[1]; ind[2] = in[15] >> shifts_1st[2]; ind[3] = in[15] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1614,7 +1614,7 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 5, 0, 7}; uint32_t shifts_7th[4] = {0, 9, 0, 11}; uint32_t shifts_8th[4] = {0, 13, 0, 15}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1624,8 +1624,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 17 | in[1] << 15) >> shifts_1st[1]; ind[2] = in[1] 
>> shifts_1st[2]; ind[3] = (in[1] >> 19 | in[2] << 13) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1634,8 +1634,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[2] >> 21 | in[3] << 11) >> shifts_2nd[1]; ind[2] = in[3] >> shifts_2nd[2]; ind[3] = (in[3] >> 23 | in[4] << 9) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1644,8 +1644,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_3rd[1]; ind[2] = in[5] >> shifts_3rd[2]; ind[3] = (in[5] >> 27 | in[6] << 5) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1654,8 +1654,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_4th[1]; ind[2] = in[7] >> shifts_4th[2]; ind[3] = (in[7] >> 31 | in[8] << 1) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1664,8 +1664,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[9] >> shifts_5th[1]; ind[2] = (in[9] >> 18 | in[10] << 14) >> shifts_5th[2]; ind[3] = in[10] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1674,8 +1674,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[11] >> shifts_6th[1]; ind[2] = (in[11] >> 22 | in[12] << 10) >> shifts_6th[2]; ind[3] = in[12] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1684,8 +1684,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[13] >> shifts_7th[1]; ind[2] = (in[13] >> 26 | in[14] << 6) >> shifts_7th[2]; ind[3] = in[14] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1694,8 +1694,8 @@ inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[15] >> shifts_8th[1]; ind[2] = (in[15] >> 30 | in[16] << 2) >> shifts_8th[2]; ind[3] = in[16] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1711,7 +1711,7 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_2nd[4] = {8, 0, 12, 0}; uint32_t shifts_3rd[4] = {0, 2, 0, 6}; uint32_t shifts_4th[4] = {0, 10, 0, 14}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1721,8 +1721,8 @@ inline 
static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 18 | in[1] << 14) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1731,8 +1731,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[1]; ind[2] = in[3] >> shifts_2nd[2]; ind[3] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1741,8 +1741,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[5] >> shifts_3rd[1]; ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_3rd[2]; ind[3] = in[6] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1751,8 +1751,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[7] >> shifts_4th[1]; ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[2]; ind[3] = in[8] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1761,8 +1761,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[9] >> 18 | in[10] << 14) >> shifts_1st[1]; ind[2] = in[10] >> shifts_1st[2]; ind[3] = (in[10] >> 22 | in[11] << 10) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1771,8 +1771,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[11] >> 26 | in[12] << 6) >> shifts_2nd[1]; ind[2] = in[12] >> shifts_2nd[2]; ind[3] = (in[12] >> 30 | in[13] << 2) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1781,8 +1781,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[14] >> shifts_3rd[1]; ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_3rd[2]; ind[3] = in[15] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1791,8 +1791,8 @@ inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[16] >> shifts_4th[1]; ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_4th[2]; ind[3] = in[17] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1812,7 +1812,7 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 0, 2, 0}; uint32_t shifts_7th[4] = {8, 0, 0, 1}; uint32_t shifts_8th[4] = {0, 7, 0, 13}; - 
uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1822,8 +1822,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 19 | in[1] << 13) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 25 | in[2] << 7) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1832,8 +1832,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[2] >> 31 | in[3] << 1) >> shifts_2nd[1]; ind[2] = (in[3] >> 18 | in[4] << 14) >> shifts_2nd[2]; ind[3] = in[4] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1842,8 +1842,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[5] >> shifts_3rd[1]; ind[2] = (in[5] >> 30 | in[6] << 2) >> shifts_3rd[2]; ind[3] = (in[6] >> 17 | in[7] << 15) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1852,8 +1852,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 23 | in[8] << 9) >> shifts_4th[1]; ind[2] = in[8] >> shifts_4th[2]; ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1862,8 +1862,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[10] >> shifts_5th[1]; ind[2] = (in[10] >> 22 | in[11] << 10) >> shifts_5th[2]; ind[3] = in[11] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1872,8 +1872,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[12] >> 15 | in[13] << 17) >> shifts_6th[1]; ind[2] = in[13] >> shifts_6th[2]; ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1882,8 +1882,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[14] >> 27 | in[15] << 5) >> shifts_7th[1]; ind[2] = (in[15] >> 14 | in[16] << 18) >> shifts_7th[2]; ind[3] = in[16] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1892,8 +1892,8 @@ inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[17] >> shifts_8th[1]; ind[2] = (in[17] >> 26 | in[18] << 6) >> shifts_8th[2]; ind[3] = in[18] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1907,7 +1907,7 @@ inline static 
const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 8, 0}; uint32_t shifts_2nd[4] = {0, 4, 0, 12}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -1917,8 +1917,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 20 | in[1] << 12) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1927,8 +1927,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[3] >> shifts_2nd[1]; ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[2]; ind[3] = in[4] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1937,8 +1937,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[5] >> 20 | in[6] << 12) >> shifts_1st[1]; ind[2] = in[6] >> shifts_1st[2]; ind[3] = (in[6] >> 28 | in[7] << 4) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1947,8 +1947,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[8] >> shifts_2nd[1]; ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[2]; ind[3] = in[9] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1957,8 +1957,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[10] >> 20 | in[11] << 12) >> shifts_1st[1]; ind[2] = in[11] >> shifts_1st[2]; ind[3] = (in[11] >> 28 | in[12] << 4) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1967,8 +1967,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[13] >> shifts_2nd[1]; ind[2] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[2]; ind[3] = in[14] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1977,8 +1977,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[15] >> 20 | in[16] << 12) >> shifts_1st[1]; ind[2] = in[16] >> shifts_1st[2]; ind[3] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -1987,8 +1987,8 @@ inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[18] >> shifts_2nd[1]; ind[2] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[2]; ind[3] = in[19] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = 
vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2008,7 +2008,7 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {4, 0, 0, 3}; uint32_t shifts_7th[4] = {0, 0, 2, 0}; uint32_t shifts_8th[4] = {0, 1, 0, 11}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2018,8 +2018,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 21 | in[1] << 11) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2028,8 +2028,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[3] >> shifts_2nd[1]; ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[2]; ind[3] = (in[4] >> 19 | in[5] << 13) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2038,8 +2038,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_3rd[1]; ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_3rd[2]; ind[3] = in[7] >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2048,8 +2048,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[8] >> 17 | in[9] << 15) >> shifts_4th[1]; ind[2] = in[9] >> shifts_4th[2]; ind[3] = (in[9] >> 27 | in[10] << 5) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2058,8 +2058,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[11] >> shifts_5th[1]; ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_5th[2]; ind[3] = (in[12] >> 15 | in[13] << 17) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2068,8 +2068,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[13] >> 25 | in[14] << 7) >> shifts_6th[1]; ind[2] = (in[14] >> 14 | in[15] << 18) >> shifts_6th[2]; ind[3] = in[15] >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2078,8 +2078,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_7th[1]; ind[2] = in[17] >> shifts_7th[2]; ind[3] = (in[17] >> 23 | in[18] << 9) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2088,8 +2088,8 @@ inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out 
ind[1] = in[19] >> shifts_8th[1]; ind[2] = (in[19] >> 22 | in[20] << 10) >> shifts_8th[2]; ind[3] = in[20] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2105,7 +2105,7 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_2nd[4] = {0, 0, 4, 0}; uint32_t shifts_3rd[4] = {0, 6, 0, 0}; uint32_t shifts_4th[4] = {8, 0, 0, 10}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2115,8 +2115,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[1]; ind[2] = (in[1] >> 12 | in[2] << 20) >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2125,8 +2125,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[3] >> 14 | in[4] << 18) >> shifts_2nd[1]; ind[2] = in[4] >> shifts_2nd[2]; ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2135,8 +2135,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[6] >> shifts_3rd[1]; ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_3rd[2]; ind[3] = (in[7] >> 18 | in[8] << 14) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2145,8 +2145,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[8] >> 30 | in[9] << 2) >> shifts_4th[1]; ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_4th[2]; ind[3] = in[10] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2155,8 +2155,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[11] >> 22 | in[12] << 10) >> shifts_1st[1]; ind[2] = (in[12] >> 12 | in[13] << 20) >> shifts_1st[2]; ind[3] = in[13] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2165,8 +2165,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[14] >> 14 | in[15] << 18) >> shifts_2nd[1]; ind[2] = in[15] >> shifts_2nd[2]; ind[3] = (in[15] >> 26 | in[16] << 6) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2175,8 +2175,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[17] >> shifts_3rd[1]; ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_3rd[2]; ind[3] = (in[18] >> 18 | in[19] << 14) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); 
+ reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2185,8 +2185,8 @@ inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[19] >> 30 | in[20] << 2) >> shifts_4th[1]; ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_4th[2]; ind[3] = in[21] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2206,7 +2206,7 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 3, 0, 0}; uint32_t shifts_7th[4] = {8, 0, 0, 0}; uint32_t shifts_8th[4] = {4, 0, 0, 9}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2216,8 +2216,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 23 | in[1] << 9) >> shifts_1st[1]; ind[2] = (in[1] >> 14 | in[2] << 18) >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2226,8 +2226,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[3] >> 19 | in[4] << 13) >> shifts_2nd[1]; ind[2] = (in[4] >> 10 | in[5] << 22) >> shifts_2nd[2]; ind[3] = in[5] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2236,8 +2236,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[6] >> 15 | in[7] << 17) >> shifts_3rd[1]; ind[2] = in[7] >> shifts_3rd[2]; ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2246,8 +2246,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[9] >> 11 | in[10] << 21) >> shifts_4th[1]; ind[2] = in[10] >> shifts_4th[2]; ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2256,8 +2256,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[12] >> shifts_5th[1]; ind[2] = (in[12] >> 30 | in[13] << 2) >> shifts_5th[2]; ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2266,8 +2266,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[15] >> shifts_6th[1]; ind[2] = (in[15] >> 26 | in[16] << 6) >> shifts_6th[2]; ind[3] = (in[16] >> 17 | in[17] << 15) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2276,8 +2276,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, 
uint32_t* out ind[1] = (in[17] >> 31 | in[18] << 1) >> shifts_7th[1]; ind[2] = (in[18] >> 22 | in[19] << 10) >> shifts_7th[2]; ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2286,8 +2286,8 @@ inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[20] >> 27 | in[21] << 5) >> shifts_8th[1]; ind[2] = (in[21] >> 18 | in[22] << 14) >> shifts_8th[2]; ind[3] = in[22] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2300,7 +2300,7 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out uint32_t mask = 0xffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 8}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2310,8 +2310,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[1]; ind[2] = (in[1] >> 16 | in[2] << 16) >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2320,8 +2320,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[1]; ind[2] = (in[4] >> 16 | in[5] << 16) >> shifts_1st[2]; ind[3] = in[5] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2330,8 +2330,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[1]; ind[2] = (in[7] >> 16 | in[8] << 16) >> shifts_1st[2]; ind[3] = in[8] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2340,8 +2340,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[1]; ind[2] = (in[10] >> 16 | in[11] << 16) >> shifts_1st[2]; ind[3] = in[11] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2350,8 +2350,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[12] >> 24 | in[13] << 8) >> shifts_1st[1]; ind[2] = (in[13] >> 16 | in[14] << 16) >> shifts_1st[2]; ind[3] = in[14] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2360,8 +2360,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[1]; ind[2] = (in[16] >> 16 | in[17] << 16) >> shifts_1st[2]; ind[3] = in[17] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = 
vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2370,8 +2370,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[18] >> 24 | in[19] << 8) >> shifts_1st[1]; ind[2] = (in[19] >> 16 | in[20] << 16) >> shifts_1st[2]; ind[3] = in[20] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2380,8 +2380,8 @@ inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[21] >> 24 | in[22] << 8) >> shifts_1st[1]; ind[2] = (in[22] >> 16 | in[23] << 16) >> shifts_1st[2]; ind[3] = in[23] >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2401,7 +2401,7 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 0, 6, 0}; uint32_t shifts_7th[4] = {0, 0, 0, 3}; uint32_t shifts_8th[4] = {0, 0, 0, 7}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2411,8 +2411,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 25 | in[1] << 7) >> shifts_1st[1]; ind[2] = (in[1] >> 18 | in[2] << 14) >> shifts_1st[2]; ind[3] = (in[2] >> 11 | in[3] << 21) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2421,8 +2421,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_2nd[1]; ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[2]; ind[3] = (in[5] >> 15 | in[6] << 17) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2431,8 +2431,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[7] >> shifts_3rd[1]; ind[2] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[2]; ind[3] = (in[8] >> 19 | in[9] << 13) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2441,8 +2441,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[10] >> shifts_4th[1]; ind[2] = (in[10] >> 30 | in[11] << 2) >> shifts_4th[2]; ind[3] = (in[11] >> 23 | in[12] << 9) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2451,8 +2451,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[13] >> 9 | in[14] << 23) >> shifts_5th[1]; ind[2] = in[14] >> shifts_5th[2]; ind[3] = (in[14] >> 27 | in[15] << 5) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2461,8 
+2461,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_6th[1]; ind[2] = in[17] >> shifts_6th[2]; ind[3] = (in[17] >> 31 | in[18] << 1) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2471,8 +2471,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[19] >> 17 | in[20] << 15) >> shifts_7th[1]; ind[2] = (in[20] >> 10 | in[21] << 22) >> shifts_7th[2]; ind[3] = in[21] >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2481,8 +2481,8 @@ inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_8th[1]; ind[2] = (in[23] >> 14 | in[24] << 18) >> shifts_8th[2]; ind[3] = in[24] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2498,7 +2498,7 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_2nd[4] = {0, 2, 0, 0}; uint32_t shifts_3rd[4] = {0, 0, 4, 0}; uint32_t shifts_4th[4] = {0, 0, 0, 6}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2508,8 +2508,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[1]; ind[2] = (in[1] >> 20 | in[2] << 12) >> shifts_1st[2]; ind[3] = (in[2] >> 14 | in[3] << 18) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2518,8 +2518,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[4] >> shifts_2nd[1]; ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[2]; ind[3] = (in[5] >> 22 | in[6] << 10) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2528,8 +2528,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 10 | in[8] << 22) >> shifts_3rd[1]; ind[2] = in[8] >> shifts_3rd[2]; ind[3] = (in[8] >> 30 | in[9] << 2) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2538,8 +2538,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[10] >> 18 | in[11] << 14) >> shifts_4th[1]; ind[2] = (in[11] >> 12 | in[12] << 20) >> shifts_4th[2]; ind[3] = in[12] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2548,8 +2548,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[13] >> 26 | in[14] << 6) >> shifts_1st[1]; ind[2] = (in[14] >> 20 | 
in[15] << 12) >> shifts_1st[2]; ind[3] = (in[15] >> 14 | in[16] << 18) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2558,8 +2558,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[17] >> shifts_2nd[1]; ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_2nd[2]; ind[3] = (in[18] >> 22 | in[19] << 10) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2568,8 +2568,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[20] >> 10 | in[21] << 22) >> shifts_3rd[1]; ind[2] = in[21] >> shifts_3rd[2]; ind[3] = (in[21] >> 30 | in[22] << 2) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2578,8 +2578,8 @@ inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[23] >> 18 | in[24] << 14) >> shifts_4th[1]; ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_4th[2]; ind[3] = in[25] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2599,7 +2599,7 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 0, 0, 0}; uint32_t shifts_7th[4] = {0, 3, 0, 0}; uint32_t shifts_8th[4] = {0, 0, 0, 5}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2609,8 +2609,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[1]; ind[2] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[2]; ind[3] = (in[2] >> 17 | in[3] << 15) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2619,8 +2619,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 7 | in[5] << 25) >> shifts_2nd[1]; ind[2] = in[5] >> shifts_2nd[2]; ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2629,8 +2629,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 19 | in[8] << 13) >> shifts_3rd[1]; ind[2] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[2]; ind[3] = (in[9] >> 9 | in[10] << 23) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2639,8 +2639,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[10] >> 31 | in[11] << 1) >> shifts_4th[1]; ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_4th[2]; ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = 
vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2649,8 +2649,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[14] >> 11 | in[15] << 21) >> shifts_5th[1]; ind[2] = (in[15] >> 6 | in[16] << 26) >> shifts_5th[2]; ind[3] = in[16] >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2659,8 +2659,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[17] >> 23 | in[18] << 9) >> shifts_6th[1]; ind[2] = (in[18] >> 18 | in[19] << 14) >> shifts_6th[2]; ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2669,8 +2669,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[21] >> shifts_7th[1]; ind[2] = (in[21] >> 30 | in[22] << 2) >> shifts_7th[2]; ind[3] = (in[22] >> 25 | in[23] << 7) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2679,8 +2679,8 @@ inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[24] >> 15 | in[25] << 17) >> shifts_8th[1]; ind[2] = (in[25] >> 10 | in[26] << 22) >> shifts_8th[2]; ind[3] = in[26] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2694,7 +2694,7 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 0, 4}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2704,8 +2704,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[1]; ind[2] = (in[1] >> 24 | in[2] << 8) >> shifts_1st[2]; ind[3] = (in[2] >> 20 | in[3] << 12) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2714,8 +2714,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 12 | in[5] << 20) >> shifts_2nd[1]; ind[2] = (in[5] >> 8 | in[6] << 24) >> shifts_2nd[2]; ind[3] = in[6] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2724,8 +2724,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[1]; ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_1st[2]; ind[3] = (in[9] >> 20 | in[10] << 12) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2734,8 
+2734,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[11] >> 12 | in[12] << 20) >> shifts_2nd[1]; ind[2] = (in[12] >> 8 | in[13] << 24) >> shifts_2nd[2]; ind[3] = in[13] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2744,8 +2744,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[14] >> 28 | in[15] << 4) >> shifts_1st[1]; ind[2] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[2]; ind[3] = (in[16] >> 20 | in[17] << 12) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2754,8 +2754,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[18] >> 12 | in[19] << 20) >> shifts_2nd[1]; ind[2] = (in[19] >> 8 | in[20] << 24) >> shifts_2nd[2]; ind[3] = in[20] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2764,8 +2764,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[21] >> 28 | in[22] << 4) >> shifts_1st[1]; ind[2] = (in[22] >> 24 | in[23] << 8) >> shifts_1st[2]; ind[3] = (in[23] >> 20 | in[24] << 12) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2774,8 +2774,8 @@ inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[25] >> 12 | in[26] << 20) >> shifts_2nd[1]; ind[2] = (in[26] >> 8 | in[27] << 24) >> shifts_2nd[2]; ind[3] = in[27] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2795,7 +2795,7 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_6th[4] = {0, 1, 0, 0}; uint32_t shifts_7th[4] = {0, 0, 0, 0}; uint32_t shifts_8th[4] = {0, 0, 0, 3}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2805,8 +2805,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 29 | in[1] << 3) >> shifts_1st[1]; ind[2] = (in[1] >> 26 | in[2] << 6) >> shifts_1st[2]; ind[3] = (in[2] >> 23 | in[3] << 9) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2815,8 +2815,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 17 | in[5] << 15) >> shifts_2nd[1]; ind[2] = (in[5] >> 14 | in[6] << 18) >> shifts_2nd[2]; ind[3] = (in[6] >> 11 | in[7] << 21) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2825,8 +2825,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = 
(in[8] >> 5 | in[9] << 27) >> shifts_3rd[1]; ind[2] = in[9] >> shifts_3rd[2]; ind[3] = (in[9] >> 31 | in[10] << 1) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2835,8 +2835,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_4th[1]; ind[2] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[2]; ind[3] = (in[13] >> 19 | in[14] << 13) >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2845,8 +2845,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[15] >> 13 | in[16] << 19) >> shifts_5th[1]; ind[2] = (in[16] >> 10 | in[17] << 22) >> shifts_5th[2]; ind[3] = (in[17] >> 7 | in[18] << 25) >> shifts_5th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2855,8 +2855,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = in[19] >> shifts_6th[1]; ind[2] = (in[19] >> 30 | in[20] << 2) >> shifts_6th[2]; ind[3] = (in[20] >> 27 | in[21] << 5) >> shifts_6th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2865,8 +2865,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_7th[1]; ind[2] = (in[23] >> 18 | in[24] << 14) >> shifts_7th[2]; ind[3] = (in[24] >> 15 | in[25] << 17) >> shifts_7th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2875,8 +2875,8 @@ inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[26] >> 9 | in[27] << 23) >> shifts_8th[1]; ind[2] = (in[27] >> 6 | in[28] << 26) >> shifts_8th[2]; ind[3] = in[28] >> shifts_8th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2892,7 +2892,7 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out uint32_t shifts_2nd[4] = {0, 0, 0, 0}; uint32_t shifts_3rd[4] = {0, 0, 0, 0}; uint32_t shifts_4th[4] = {0, 0, 0, 2}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2902,8 +2902,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[1]; ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[2]; ind[3] = (in[2] >> 26 | in[3] << 6) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2912,8 +2912,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[1]; ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_2nd[2]; 
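Every unpack kernel in this family follows the same recipe: each output lane is stitched together from one or two adjacent packed 32-bit words with shifts and ORs, then masked down to the target bit width; the NEON intrinsics only vectorize the final load-and-mask step four lanes at a time. Below is a scalar Go sketch of the same idea (Go chosen to match the rest of this patch; the helper names and the round-trip harness are invented for illustration and are not part of this diff):

```go
package main

import "fmt"

// packScalar packs values of `width` bits into 32-bit words, lowest bits first.
func packScalar(vals []uint32, width uint) []uint32 {
	out := make([]uint32, (uint(len(vals))*width+31)/32)
	bitPos := uint(0)
	for _, v := range vals {
		word, shift := bitPos/32, bitPos%32
		out[word] |= v << shift
		if shift+width > 32 { // value spills into the next word
			out[word+1] |= v >> (32 - shift)
		}
		bitPos += width
	}
	return out
}

// unpackScalar is the scalar counterpart of the NEON kernels above: each output
// is assembled from one or two adjacent packed words with shifts and ORs, then
// masked down to `width` bits.
func unpackScalar(in []uint32, width uint, n int) []uint32 {
	mask := uint32((uint64(1) << width) - 1)
	out := make([]uint32, n)
	bitPos := uint(0)
	for i := 0; i < n; i++ {
		word, shift := bitPos/32, bitPos%32
		v := in[word] >> shift
		if shift+width > 32 { // value straddles a word boundary
			v |= in[word+1] << (32 - shift)
		}
		out[i] = v & mask
		bitPos += width
	}
	return out
}

func main() {
	vals := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
	packed := packScalar(vals, 26)
	fmt.Println(unpackScalar(packed, 26, len(vals))) // [1 2 3 4 5 6 7 8]
}
```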
ind[3] = (in[6] >> 18 | in[7] << 14) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2922,8 +2922,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[1]; ind[2] = (in[9] >> 12 | in[10] << 20) >> shifts_3rd[2]; ind[3] = (in[10] >> 10 | in[11] << 22) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2932,8 +2932,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[12] >> 6 | in[13] << 26) >> shifts_4th[1]; ind[2] = (in[13] >> 4 | in[14] << 28) >> shifts_4th[2]; ind[3] = in[14] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2942,8 +2942,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[15] >> 30 | in[16] << 2) >> shifts_1st[1]; ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[2]; ind[3] = (in[17] >> 26 | in[18] << 6) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2952,8 +2952,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[19] >> 22 | in[20] << 10) >> shifts_2nd[1]; ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_2nd[2]; ind[3] = (in[21] >> 18 | in[22] << 14) >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2962,8 +2962,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[23] >> 14 | in[24] << 18) >> shifts_3rd[1]; ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_3rd[2]; ind[3] = (in[25] >> 10 | in[26] << 22) >> shifts_3rd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2972,8 +2972,8 @@ inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[27] >> 6 | in[28] << 26) >> shifts_4th[1]; ind[2] = (in[28] >> 4 | in[29] << 28) >> shifts_4th[2]; ind[3] = in[29] >> shifts_4th[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -2987,7 +2987,7 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 0, 1}; - uint32x4_t reg_shft, reg_masks; + uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); @@ -2997,8 +2997,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[0] >> 31 | in[1] << 1) >> shifts_1st[1]; ind[2] = (in[1] >> 30 | in[2] << 2) >> shifts_1st[2]; ind[3] = (in[2] >> 29 | in[3] << 3) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = 
vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3007,8 +3007,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[4] >> 27 | in[5] << 5) >> shifts_1st[1]; ind[2] = (in[5] >> 26 | in[6] << 6) >> shifts_1st[2]; ind[3] = (in[6] >> 25 | in[7] << 7) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3017,8 +3017,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[8] >> 23 | in[9] << 9) >> shifts_1st[1]; ind[2] = (in[9] >> 22 | in[10] << 10) >> shifts_1st[2]; ind[3] = (in[10] >> 21 | in[11] << 11) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3027,8 +3027,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[12] >> 19 | in[13] << 13) >> shifts_1st[1]; ind[2] = (in[13] >> 18 | in[14] << 14) >> shifts_1st[2]; ind[3] = (in[14] >> 17 | in[15] << 15) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3037,8 +3037,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[16] >> 15 | in[17] << 17) >> shifts_1st[1]; ind[2] = (in[17] >> 14 | in[18] << 18) >> shifts_1st[2]; ind[3] = (in[18] >> 13 | in[19] << 19) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3047,8 +3047,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[20] >> 11 | in[21] << 21) >> shifts_1st[1]; ind[2] = (in[21] >> 10 | in[22] << 22) >> shifts_1st[2]; ind[3] = (in[22] >> 9 | in[23] << 23) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3057,8 +3057,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[24] >> 7 | in[25] << 25) >> shifts_1st[1]; ind[2] = (in[25] >> 6 | in[26] << 26) >> shifts_1st[2]; ind[3] = (in[26] >> 5 | in[27] << 27) >> shifts_1st[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; @@ -3067,8 +3067,8 @@ inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out ind[1] = (in[28] >> 3 | in[29] << 29) >> shifts_2nd[1]; ind[2] = (in[29] >> 2 | in[30] << 30) >> shifts_2nd[2]; ind[3] = in[30] >> shifts_2nd[3]; - reg_shft = vld1q_u32(ind); - results = vandq_u32(reg_shft, reg_masks); + reg_shift = vld1q_u32(ind); + results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; diff --git a/go/parquet/internal/utils/bit_reader.go b/go/parquet/internal/utils/bit_reader.go index bf9741c79878b..0bf501e0488cf 100644 --- a/go/parquet/internal/utils/bit_reader.go +++ b/go/parquet/internal/utils/bit_reader.go @@ -152,7 +152,7 @@ func 
(b *BitReader) GetAligned(nbytes int, v interface{}) bool { if n != nbytes { return false } - // zero pad the the bytes + // zero pad the bytes memory.Set(b.raw[n:typBytes], 0) switch v := v.(type) { @@ -215,7 +215,7 @@ func (b *BitReader) GetBatchIndex(bits uint, out []IndexType) (i int, err error) var val uint64 length := len(out) - // if we're not currently byte-aligned, read bits until we are byte-aligned. + // if we aren't currently byte-aligned, read bits until we are byte-aligned. for ; i < length && b.bitoffset != 0; i++ { val, err = b.next(bits) out[i] = IndexType(val) diff --git a/go/parquet/internal/utils/rle.go b/go/parquet/internal/utils/rle.go index 21ac2ce6b5db5..f367e7dc13cee 100644 --- a/go/parquet/internal/utils/rle.go +++ b/go/parquet/internal/utils/rle.go @@ -81,7 +81,7 @@ func MaxRLEBufferSize(width, numValues int) int { // on a byte boundary without padding. // Given that we know it is a multiple of 8, we store the number of 8-groups rather than // the actual number of encoded ints. (This means that the total number of encoded values -// can not be determined from the encoded data, since the number of values in the last +// cannot be determined from the encoded data, since the number of values in the last // group may not be a multiple of 8). For the last group of literal runs, we pad // the group to 8 with zeros. This allows for 8 at a time decoding on the read side // without the need for additional checks. @@ -479,7 +479,7 @@ func (r *RleEncoder) Flush() int { if r.repCount > 0 && allRep { r.flushRepeated() } else { - // buffer the last grou pof literals to 8 by padding with 0s + // buffer the last group of literals to 8 by padding with 0s for len(r.buffer) != 0 && len(r.buffer) < 8 { r.buffer = append(r.buffer, 0) } diff --git a/go/parquet/metadata/file.go b/go/parquet/metadata/file.go index 3335140c2e1c8..fbbbae1eef892 100644 --- a/go/parquet/metadata/file.go +++ b/go/parquet/metadata/file.go @@ -119,7 +119,7 @@ func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) { createdBy := f.props.CreatedBy() f.metadata.CreatedBy = &createdBy - // Users cannot set the `ColumnOrder` since we do not not have user defined sort order + // Users cannot set the `ColumnOrder` since we do not have user defined sort order // in the spec yet. // // We always default to `TYPE_DEFINED_ORDER`. We can expose it in @@ -401,7 +401,7 @@ func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata { // Panics if f.FileDecryptor is nil func (f *FileMetaData) VerifySignature(signature []byte) bool { if f.FileDecryptor == nil { - panic("decryption not set propertly, cannot verify signature") + panic("decryption not set properly, cannot verify signature") } serializer := thrift.NewThriftSerializer() @@ -472,7 +472,7 @@ func (f *FileMetaData) Version() parquet.Version { case 2: return parquet.V2_LATEST default: - // imporperly set version, assume parquet 1.0 + // improperly set version, assume parquet 1.0 return parquet.V1_0 } } diff --git a/go/parquet/metadata/statistics.go b/go/parquet/metadata/statistics.go index 43294272dec35..a9bda405bb9b5 100644 --- a/go/parquet/metadata/statistics.go +++ b/go/parquet/metadata/statistics.go @@ -271,7 +271,7 @@ func signedByteLess(a, b []byte) bool { sa := *(*[]int8)(unsafe.Pointer(&a)) sb := *(*[]int8)(unsafe.Pointer(&b)) - // we can short circuit for different signd numbers or for equal length byte + // we can short circuit for different signed numbers or for equal length byte // arrays that have different first bytes. 
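For context on the rle.go comment above about storing 8-value groups: the Parquet RLE/bit-packed hybrid encodes each run with a little-endian varint header whose low bit selects the run type, and a bit-packed run records the number of 8-value groups rather than the value count, which is exactly why the last literal group has to be padded to 8. A hedged Go sketch of just the header arithmetic (helper names are invented here; the real encoder in rle.go does considerably more):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// rleHeader encodes the header of a repeated run: (run length) << 1, low bit 0.
func rleHeader(runLen uint64) []byte {
	return binary.AppendUvarint(nil, runLen<<1)
}

// bitPackedHeader encodes the header of a literal run: (number of 8-value
// groups) << 1 | 1, low bit 1. Because only the group count is stored, the
// true number of values in the final group cannot be recovered from the data
// alone, as the comment above notes.
func bitPackedHeader(numValues uint64) []byte {
	groups := (numValues + 7) / 8 // literal runs are padded out to a multiple of 8
	return binary.AppendUvarint(nil, groups<<1|1)
}

func main() {
	fmt.Printf("% x\n", rleHeader(100))      // one value repeated 100 times
	fmt.Printf("% x\n", bitPackedHeader(20)) // 20 literal values -> 3 groups of 8
}
```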
The equality requirement is necessary // for sign extension cases. 0xFF10 should be equal to 0x10 (due to big endian sign extension) if int8(0x80&uint8(sa[0])) != int8(0x80&uint8(sb[0])) || (len(sa) == len(sb) && sa[0] != sb[0]) { diff --git a/go/parquet/metadata/statistics_types.gen.go b/go/parquet/metadata/statistics_types.gen.go index baecd185d14fc..a0e2949251368 100644 --- a/go/parquet/metadata/statistics_types.gen.go +++ b/go/parquet/metadata/statistics_types.gen.go @@ -67,7 +67,7 @@ func NewInt32Statistics(descr *schema.Column, mem memory.Allocator) *Int32Statis } } -// NewInt32StatisticsFromEncoded will construct a propertly typed statistics object +// NewInt32StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewInt32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int32Statistics { ret := NewInt32Statistics(descr, mem) @@ -367,7 +367,7 @@ func NewInt64Statistics(descr *schema.Column, mem memory.Allocator) *Int64Statis } } -// NewInt64StatisticsFromEncoded will construct a propertly typed statistics object +// NewInt64StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewInt64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int64Statistics { ret := NewInt64Statistics(descr, mem) @@ -667,7 +667,7 @@ func NewInt96Statistics(descr *schema.Column, mem memory.Allocator) *Int96Statis } } -// NewInt96StatisticsFromEncoded will construct a propertly typed statistics object +// NewInt96StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewInt96StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Int96Statistics { ret := NewInt96Statistics(descr, mem) @@ -952,7 +952,7 @@ func NewFloat32Statistics(descr *schema.Column, mem memory.Allocator) *Float32St } } -// NewFloat32StatisticsFromEncoded will construct a propertly typed statistics object +// NewFloat32StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewFloat32StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float32Statistics { ret := NewFloat32Statistics(descr, mem) @@ -1251,7 +1251,7 @@ func NewFloat64Statistics(descr *schema.Column, mem memory.Allocator) *Float64St } } -// NewFloat64StatisticsFromEncoded will construct a propertly typed statistics object +// NewFloat64StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewFloat64StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float64Statistics { ret := NewFloat64Statistics(descr, mem) @@ -1550,7 +1550,7 @@ func NewBooleanStatistics(descr *schema.Column, mem memory.Allocator) *BooleanSt } } -// NewBooleanStatisticsFromEncoded will construct a propertly typed statistics object +// NewBooleanStatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. 
func NewBooleanStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *BooleanStatistics { ret := NewBooleanStatistics(descr, mem) @@ -1838,7 +1838,7 @@ func NewByteArrayStatistics(descr *schema.Column, mem memory.Allocator) *ByteArr } } -// NewByteArrayStatisticsFromEncoded will construct a propertly typed statistics object +// NewByteArrayStatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *ByteArrayStatistics { ret := NewByteArrayStatistics(descr, mem) @@ -2149,7 +2149,7 @@ func NewFixedLenByteArrayStatistics(descr *schema.Column, mem memory.Allocator) } } -// NewFixedLenByteArrayStatisticsFromEncoded will construct a propertly typed statistics object +// NewFixedLenByteArrayStatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewFixedLenByteArrayStatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *FixedLenByteArrayStatistics { ret := NewFixedLenByteArrayStatistics(descr, mem) @@ -2470,7 +2470,7 @@ func NewFloat16Statistics(descr *schema.Column, mem memory.Allocator) *Float16St } } -// NewFloat16StatisticsFromEncoded will construct a propertly typed statistics object +// NewFloat16StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. func NewFloat16StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *Float16Statistics { ret := NewFloat16Statistics(descr, mem) diff --git a/go/parquet/metadata/statistics_types.gen.go.tmpl b/go/parquet/metadata/statistics_types.gen.go.tmpl index 93495527c7e54..26fe7f1531999 100644 --- a/go/parquet/metadata/statistics_types.gen.go.tmpl +++ b/go/parquet/metadata/statistics_types.gen.go.tmpl @@ -74,7 +74,7 @@ func New{{.Name}}Statistics(descr *schema.Column, mem memory.Allocator) *{{.Name } } -// New{{.Name}}StatisticsFromEncoded will construct a propertly typed statistics object +// New{{.Name}}StatisticsFromEncoded will construct a properly typed statistics object // initializing it with the provided information. 
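statistics_types.gen.go is generated from the .tmpl file edited here, which is why the "propertly" typo has to be fixed in both files: the template is expanded once per physical type, so the doc comment fans out to every generated constructor. A minimal text/template sketch of that expansion (the template body below is a trimmed illustration, not the real .tmpl):

```go
package main

import (
	"os"
	"text/template"
)

// A cut-down picture of how the per-type statistics code is produced: one
// template, executed once per physical type name.
const tmpl = `
// New{{.Name}}StatisticsFromEncoded will construct a properly typed statistics object
// initializing it with the provided information.
func New{{.Name}}StatisticsFromEncoded(/* descr, mem, nvalues, encoded */) *{{.Name}}Statistics {
	// ... generated body elided ...
}
`

func main() {
	t := template.Must(template.New("stats").Parse(tmpl))
	for _, typ := range []struct{ Name string }{{"Int32"}, {"Int64"}, {"Float16"}} {
		_ = t.Execute(os.Stdout, typ)
	}
}
```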
func New{{.Name}}StatisticsFromEncoded(descr *schema.Column, mem memory.Allocator, nvalues int64, encoded StatProvider) *{{.Name}}Statistics { ret := New{{.Name}}Statistics(descr, mem) diff --git a/go/parquet/pqarrow/column_readers.go b/go/parquet/pqarrow/column_readers.go index 49f3fac0a3b7c..3c38aba5c32a6 100644 --- a/go/parquet/pqarrow/column_readers.go +++ b/go/parquet/pqarrow/column_readers.go @@ -895,7 +895,7 @@ func transferDecimalBytes(rdr file.BinaryRecordReader, dt arrow.DataType) (*arro rec := in.Value(i) if len(rec) <= 0 { - return nil, fmt.Errorf("invalud BYTEARRAY length for type: %s", dt) + return nil, fmt.Errorf("invalid BYTEARRAY length for type: %s", dt) } out[i], err = bigEndianToDecimal128(rec) if err != nil { diff --git a/go/parquet/pqarrow/encode_arrow.go b/go/parquet/pqarrow/encode_arrow.go index 8926d0ba51a07..0836d135243da 100644 --- a/go/parquet/pqarrow/encode_arrow.go +++ b/go/parquet/pqarrow/encode_arrow.go @@ -141,7 +141,7 @@ func newArrowColumnWriter(data *arrow.Chunked, offset, size int64, manifest *Sch return arrowColumnWriter{}, nil } if leafCount != bldr.leafCount() { - return arrowColumnWriter{}, fmt.Errorf("data type leaf_count != builder leafcount: %d - %d", leafCount, bldr.leafCount()) + return arrowColumnWriter{}, fmt.Errorf("data type leaf_count != builder leaf_count: %d - %d", leafCount, bldr.leafCount()) } builders = append(builders, bldr) } @@ -392,7 +392,7 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr return err } } else { - // no data conversion neccessary + // no data conversion necessary if leafArr.Data().Buffers()[1] != nil { data = arrow.Int64Traits.CastFromBytes(leafArr.Data().Buffers()[1].Bytes()) data = data[leafArr.Data().Offset() : leafArr.Data().Offset()+leafArr.Len()] diff --git a/go/parquet/pqarrow/file_reader.go b/go/parquet/pqarrow/file_reader.go index 3534cc87b78b5..a55dbe46e07fa 100755 --- a/go/parquet/pqarrow/file_reader.go +++ b/go/parquet/pqarrow/file_reader.go @@ -293,7 +293,7 @@ type resultPair struct { err error } -//! This is Super complicated. I would simpify the pattern, but it works and hesitant to change what works. +//! This is Super complicated. I would simplify the pattern, but it works and hesitant to change what works. // ReadRowGroups is for generating an array.Table from the file but filtering to only read the requested // columns and row groups rather than the entire file which ReadTable does. @@ -363,7 +363,7 @@ func (fr *FileReader) ReadRowGroups(ctx context.Context, indices, rowGroups []in close(ch) // output slice of columns - columns := make([]arrow.Column, len(sc.Fields())) + columns := make([]arrow.Column, sc.NumFields()) defer releaseColumns(columns) for data := range results { if data.err != nil { diff --git a/go/parquet/pqarrow/file_writer.go b/go/parquet/pqarrow/file_writer.go index bc484ba243f87..1164cd690c399 100644 --- a/go/parquet/pqarrow/file_writer.go +++ b/go/parquet/pqarrow/file_writer.go @@ -134,6 +134,23 @@ func (fw *FileWriter) RowGroupTotalBytesWritten() int64 { return 0 } +// RowGroupNumRows returns the number of rows written to the current row group. +// Returns an error if they are unequal between columns that have been written so far. +func (fw *FileWriter) RowGroupNumRows() (int, error) { + if fw.rgw != nil { + return fw.rgw.NumRows() + } + return 0, nil +} + +// NumRows returns the total number of rows that have been written so far. 
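The test file added below exercises the two new accessors; for reference, here is a minimal non-test sketch of the intended call pattern. The writer setup mirrors the new test, and the schema and data are arbitrary placeholders:

```go
package main

import (
	"bytes"
	"log"
	"strings"

	"github.com/apache/arrow/go/v15/arrow"
	"github.com/apache/arrow/go/v15/arrow/array"
	"github.com/apache/arrow/go/v15/arrow/memory"
	"github.com/apache/arrow/go/v15/parquet"
	"github.com/apache/arrow/go/v15/parquet/pqarrow"
)

func main() {
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "one", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
	}, nil)
	rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema,
		strings.NewReader(`[{"one": 1}, {"one": 2}, {"one": null}]`))
	if err != nil {
		log.Fatal(err)
	}

	var buf bytes.Buffer
	w, err := pqarrow.NewFileWriter(schema, &buf,
		parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
	if err != nil {
		log.Fatal(err)
	}
	if err := w.Write(rec); err != nil {
		log.Fatal(err)
	}

	inGroup, err := w.RowGroupNumRows() // rows in the row group currently open
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("current row group: %d rows, file so far: %d rows", inGroup, w.NumRows())

	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
}
```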
+func (fw *FileWriter) NumRows() int { + if fw.wr != nil { + return fw.wr.NumRows() + } + return 0 +} + // WriteBuffered will either append to an existing row group or create a new one // based on the record length and max row group length. // diff --git a/go/parquet/pqarrow/file_writer_test.go b/go/parquet/pqarrow/file_writer_test.go new file mode 100644 index 0000000000000..0b76733a62876 --- /dev/null +++ b/go/parquet/pqarrow/file_writer_test.go @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pqarrow_test + +import ( + "bytes" + "strings" + "testing" + + "github.com/apache/arrow/go/v15/arrow" + "github.com/apache/arrow/go/v15/arrow/array" + "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/apache/arrow/go/v15/parquet" + "github.com/apache/arrow/go/v15/parquet/pqarrow" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFileWriterRowGroupNumRows(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "one", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + {Name: "two", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + }, nil) + + data := `[ + {"one": 1, "two": 2}, + {"one": 1, "two": null}, + {"one": null, "two": 2}, + {"one": null, "two": null} + ]` + record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data)) + require.NoError(t, err) + + output := &bytes.Buffer{} + writerProps := parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(100)) + writer, err := pqarrow.NewFileWriter(schema, output, writerProps, pqarrow.DefaultWriterProps()) + require.NoError(t, err) + + require.NoError(t, writer.Write(record)) + numRows, err := writer.RowGroupNumRows() + require.NoError(t, err) + assert.Equal(t, 4, numRows) + require.NoError(t, writer.Close()) +} + +func TestFileWriterNumRows(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "one", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + {Name: "two", Nullable: true, Type: arrow.PrimitiveTypes.Float64}, + }, nil) + + data := `[ + {"one": 1, "two": 2}, + {"one": 1, "two": null}, + {"one": null, "two": 2}, + {"one": null, "two": null} + ]` + record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(data)) + require.NoError(t, err) + + maxRowGroupLength := 2 + + output := &bytes.Buffer{} + writerProps := parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(int64(maxRowGroupLength))) + writer, err := pqarrow.NewFileWriter(schema, output, writerProps, pqarrow.DefaultWriterProps()) + require.NoError(t, err) + + require.NoError(t, writer.Write(record)) + rowGroupNumRows, err := writer.RowGroupNumRows() + require.NoError(t, err) + assert.Equal(t, maxRowGroupLength, rowGroupNumRows) + + require.NoError(t, 
writer.Close()) + assert.Equal(t, 4, writer.NumRows()) +} diff --git a/go/parquet/pqarrow/path_builder.go b/go/parquet/pqarrow/path_builder.go index 57a077956edea..6b94205f5dcc8 100644 --- a/go/parquet/pqarrow/path_builder.go +++ b/go/parquet/pqarrow/path_builder.go @@ -665,7 +665,7 @@ func fillRepLevels(count int, repLvl int16, ctx *pathWriteCtx) { fillCount := count // this condition occurs (rep and def levels equals), in one of a few cases: - // 1. before any list is encounted + // 1. before any list is encountered // 2. after rep-level has been filled in due to null/empty values above // 3. after finishing a list if !ctx.equalRepDeflevlsLen() { diff --git a/go/parquet/pqarrow/schema.go b/go/parquet/pqarrow/schema.go index 95c477c78b87d..ccb3dc0350ae1 100644 --- a/go/parquet/pqarrow/schema.go +++ b/go/parquet/pqarrow/schema.go @@ -174,7 +174,7 @@ func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version()) } } else if target == arrow.Second { - return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestampts to millis, micros or nanos", props.Version()) + return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis, micros or nanos", props.Version()) } return physical, logicalType, nil } @@ -233,11 +233,11 @@ func repFromNullable(isnullable bool) parquet.Repetition { } func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { - if len(typ.Fields()) == 0 { + if typ.NumFields() == 0 { return nil, fmt.Errorf("cannot write struct type '%s' with no children field to parquet. 
Consider adding a dummy child", name) } - children := make(schema.FieldList, 0, len(typ.Fields())) + children := make(schema.FieldList, 0, typ.NumFields()) for _, f := range typ.Fields() { n, err := fieldToNode(f.Name, f, props, arrprops) if err != nil { @@ -440,7 +440,7 @@ func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops Arrow props = parquet.NewWriterProperties() } - nodes := make(schema.FieldList, 0, len(sc.Fields())) + nodes := make(schema.FieldList, 0, sc.NumFields()) for _, f := range sc.Fields() { n, err := fieldToNode(f.Name, f, props, arrprops) if err != nil { @@ -1002,7 +1002,7 @@ func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (mo err = xerrors.New("unimplemented type") case arrow.STRUCT: typ := origin.Type.(*arrow.StructType) - if nchildren != len(typ.Fields()) { + if nchildren != typ.NumFields() { return } diff --git a/go/parquet/pqarrow/schema_test.go b/go/parquet/pqarrow/schema_test.go index ee5aad8913470..bc33663414075 100644 --- a/go/parquet/pqarrow/schema_test.go +++ b/go/parquet/pqarrow/schema_test.go @@ -386,7 +386,7 @@ func TestListStructBackwardCompatible(t *testing.T) { // } // } // - // Instaed of the proper 3-level encoding which would be: + // Instead of the proper 3-level encoding which would be: // // repeated group field_id=-1 schema { // optional group field_id=-1 answers (List) { diff --git a/go/parquet/schema/logical_types_test.go b/go/parquet/schema/logical_types_test.go index c371b47714f41..0fd91daf8d668 100644 --- a/go/parquet/schema/logical_types_test.go +++ b/go/parquet/schema/logical_types_test.go @@ -89,10 +89,10 @@ func TestConvertedTypeCompatibility(t *testing.T) { {"list", schema.NewListLogicalType(), schema.ConvertedTypes.List}, {"enum", schema.EnumLogicalType{}, schema.ConvertedTypes.Enum}, {"date", schema.DateLogicalType{}, schema.ConvertedTypes.Date}, - {"time_milli", schema.NewTimeLogicalType(true /* adjutedToUTC */, schema.TimeUnitMillis), schema.ConvertedTypes.TimeMillis}, - {"time_micro", schema.NewTimeLogicalType(true /* adjutedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimeMicros}, - {"timestamp_milli", schema.NewTimestampLogicalType(true /* adjutedToUTC */, schema.TimeUnitMillis), schema.ConvertedTypes.TimestampMillis}, - {"timestamp_micro", schema.NewTimestampLogicalType(true /* adjutedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimestampMicros}, + {"time_milli", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.ConvertedTypes.TimeMillis}, + {"time_micro", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimeMicros}, + {"timestamp_milli", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.ConvertedTypes.TimestampMillis}, + {"timestamp_micro", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.ConvertedTypes.TimestampMicros}, {"timestamp_milli_opts", schema.NewTimestampLogicalTypeWithOpts(schema.WithTSIsAdjustedToUTC(), schema.WithTSTimeUnitType(schema.TimeUnitMillis)), schema.ConvertedTypes.TimestampMillis}, {"uint8", schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint8}, {"uint16", schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), schema.ConvertedTypes.Uint16}, @@ -160,12 +160,12 @@ func TestNewTypeIncompatibility(t *testing.T) { {"uuid", schema.UUIDLogicalType{}, schema.UUIDLogicalType{}}, {"float16", schema.Float16LogicalType{}, 
schema.Float16LogicalType{}}, {"null", schema.NullLogicalType{}, schema.NullLogicalType{}}, - {"not-utc-time_milli", schema.NewTimeLogicalType(false /* adjutedToUTC */, schema.TimeUnitMillis), &schema.TimeLogicalType{}}, - {"not-utc-time-micro", schema.NewTimeLogicalType(false /* adjutedToUTC */, schema.TimeUnitMicros), &schema.TimeLogicalType{}}, - {"not-utc-time-nano", schema.NewTimeLogicalType(false /* adjutedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, - {"utc-time-nano", schema.NewTimeLogicalType(true /* adjutedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, - {"not-utc-timestamp-nano", schema.NewTimestampLogicalType(false /* adjutedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, - {"utc-timestamp-nano", schema.NewTimestampLogicalType(true /* adjutedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, + {"not-utc-time_milli", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimeLogicalType{}}, + {"not-utc-time-micro", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimeLogicalType{}}, + {"not-utc-time-nano", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, + {"utc-time-nano", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, + {"not-utc-timestamp-nano", schema.NewTimestampLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, + {"utc-timestamp-nano", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, } for _, tt := range tests { @@ -184,8 +184,8 @@ func TestFactoryPanic(t *testing.T) { name string f func() }{ - {"invalid TimeUnit", func() { schema.NewTimeLogicalType(true /* adjutedToUTC */, schema.TimeUnitUnknown) }}, - {"invalid timestamp unit", func() { schema.NewTimestampLogicalType(true /* adjutedToUTC */, schema.TimeUnitUnknown) }}, + {"invalid TimeUnit", func() { schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitUnknown) }}, + {"invalid timestamp unit", func() { schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitUnknown) }}, {"negative bitwidth", func() { schema.NewIntLogicalType(-1 /* bitWidth */, false /* signed */) }}, {"zero bitwidth", func() { schema.NewIntLogicalType(0 /* bitWidth */, false /* signed */) }}, {"bitwidth one", func() { schema.NewIntLogicalType(1 /* bitWidth */, false /* signed */) }}, diff --git a/go/parquet/schema/node.go b/go/parquet/schema/node.go index ff23624afa35d..c1b325eb90183 100644 --- a/go/parquet/schema/node.go +++ b/go/parquet/schema/node.go @@ -148,7 +148,7 @@ type PrimitiveNode struct { decimalMetaData DecimalMetadata } -// NewPrimitiveNodeLogical constructs a Primtive node using the provided logical type for a given +// NewPrimitiveNodeLogical constructs a Primitive node using the provided logical type for a given // physical type and typelength. 
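NewPrimitiveNodeLogical, whose doc comment is corrected here, is the usual entry point for building a typed leaf in the Go parquet schema package. A hedged usage sketch follows; the signature is taken from the function below, while treating -1 as "unset" for typeLen and the field ID is an assumption on my part:

```go
package main

import (
	"log"

	"github.com/apache/arrow/go/v15/parquet"
	"github.com/apache/arrow/go/v15/parquet/schema"
)

func main() {
	// Build an optional INT64 column annotated as a UTC-adjusted microsecond
	// timestamp. typeLen only matters for FIXED_LEN_BYTE_ARRAY, and -1 is
	// assumed to mean "no explicit field ID".
	node, err := schema.NewPrimitiveNodeLogical(
		"ts",
		parquet.Repetitions.Optional,
		schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros),
		parquet.Types.Int64,
		-1, /* typeLen */
		-1, /* field ID */
	)
	if err != nil {
		// e.g. a nested logical type on a non-group node fails, per the
		// error message reworded in this hunk.
		log.Fatal(err)
	}
	log.Println(node.Name(), node.LogicalType())
}
```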
func NewPrimitiveNodeLogical(name string, repetition parquet.Repetition, logicalType LogicalType, physicalType parquet.Type, typeLen int, id int32) (*PrimitiveNode, error) { n := &PrimitiveNode{ @@ -165,7 +165,7 @@ func NewPrimitiveNodeLogical(name string, repetition parquet.Repetition, logical return nil, fmt.Errorf("%s cannot be applied to primitive type %s", logicalType, physicalType) } } else { - return nil, fmt.Errorf("nested logical type %s can not be applied to a non-group node", logicalType) + return nil, fmt.Errorf("nested logical type %s cannot be applied to a non-group node", logicalType) } } else { n.logicalType = NoLogicalType{} @@ -373,7 +373,7 @@ type FieldList []Node // Len is equivalent to len(fieldlist) func (f FieldList) Len() int { return len(f) } -// GroupNode is for mananging nested nodes like List, Map, etc. +// GroupNode is for managing nested nodes like List, Map, etc. type GroupNode struct { node fields FieldList diff --git a/go/parquet/schema/reflection.go b/go/parquet/schema/reflection.go index c0c8e0533efb0..7f8b337795592 100644 --- a/go/parquet/schema/reflection.go +++ b/go/parquet/schema/reflection.go @@ -314,7 +314,7 @@ func infoFromTags(f reflect.StructTag) *taggedInfo { return nil } -// typeToNode recurseively converts a physical type and the tag info into parquet Nodes +// typeToNode recursively converts a physical type and the tag info into parquet Nodes // // to avoid having to propagate errors up potentially high numbers of recursive calls // we use panics and then recover in the public function NewSchemaFromStruct so that a @@ -639,7 +639,7 @@ func typeFromNode(n Node) reflect.Type { switch n.Type() { case Primitive: typ := parquetTypeToReflect[n.(*PrimitiveNode).PhysicalType()] - // if a bytearray field is annoted as a String logical type or a UTF8 converted type + // if a bytearray field is annotated as a String logical type or a UTF8 converted type // then use a string instead of parquet.ByteArray / parquet.FixedLenByteArray which are []byte if n.LogicalType().Equals(StringLogicalType{}) || n.ConvertedType() == ConvertedTypes.UTF8 { typ = reflect.TypeOf(string("")) diff --git a/go/parquet/schema/schema_test.go b/go/parquet/schema/schema_test.go index cc43c3856d68e..232a386a25470 100644 --- a/go/parquet/schema/schema_test.go +++ b/go/parquet/schema/schema_test.go @@ -380,7 +380,7 @@ func (s *SchemaConverterSuite) TestZeroColumns() { func (s *SchemaConverterSuite) TestInvalidRoot() { // According to the Parquet spec, the first element in the list // is a group whose children (and their descendants) contain all of the rest of - // the flattened schema elments. If the first element is not a group, it is malformed + // the flattened schema elements. 
If the first element is not a group, it is malformed elements := []*format.SchemaElement{NewPrimitive("not-a-group" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 0 /* fieldID */), format.NewSchemaElement()} s.Panics(func() { s.convert(elements) }) diff --git a/go/parquet/types.go b/go/parquet/types.go index 5b6f8fb0251d9..0020a079d94a9 100644 --- a/go/parquet/types.go +++ b/go/parquet/types.go @@ -295,7 +295,7 @@ var ( Double Type ByteArray Type FixedLenByteArray Type - // this only exists as a convienence so we can denote it when necessary + // this only exists as a convenience so we can denote it when necessary // nearly all functions that take a parquet.Type will error/panic if given // Undefined Undefined Type diff --git a/java/README.md b/java/README.md index 816c8c929a107..69d1255a861e6 100644 --- a/java/README.md +++ b/java/README.md @@ -48,10 +48,10 @@ a version of your choosing. ```bash $ flatc --version -flatc version 1.12.0 +flatc version 23.5.26 $ grep "dep.fbs.version" java/pom.xml - 1.12.0 + 23.5.26 ``` 2. Generate the flatbuffer java files by performing the following: diff --git a/java/algorithm/pom.xml b/java/algorithm/pom.xml index 8c6a9fb0151ef..3e32d955ec417 100644 --- a/java/algorithm/pom.xml +++ b/java/algorithm/pom.xml @@ -31,6 +31,7 @@ arrow-vector ${project.version} test-jar + test org.apache.arrow diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index c1ceb93cb40c0..fe070400ad94f 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -476,6 +476,13 @@ public void testUInt8Vector() { } } + @Test + public void testNullVector() { + try (final NullVector vector = new NullVector("v", 1024)) { + assertTrue(roundtrip(vector, NullVector.class)); + } + } + @Test public void testVarBinaryVector() { try (final VarBinaryVector vector = new VarBinaryVector("v", allocator)) { diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 7b69179053d7f..8f41d2b65b7d1 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -54,10 +54,6 @@ io.grpc grpc-core - - io.grpc - grpc-context - io.grpc grpc-protobuf @@ -149,7 +145,13 @@ org.apache.maven.plugins maven-shade-plugin - 3.1.1 + + 3.2.4 shade-main diff --git a/java/flight/flight-grpc/pom.xml b/java/flight/flight-grpc/pom.xml index e7bb9508d2403..af765f8c436be 100644 --- a/java/flight/flight-grpc/pom.xml +++ b/java/flight/flight-grpc/pom.xml @@ -48,12 +48,13 @@ io.grpc - grpc-core + grpc-stub + + + io.grpc + grpc-inprocess + test - - io.grpc - grpc-stub - org.apache.arrow arrow-memory-core diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcFlightStreamResultSet.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcFlightStreamResultSet.java index e23267ebe9ebf..f69681d77a585 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcFlightStreamResultSet.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/ArrowFlightJdbcFlightStreamResultSet.java @@ -48,6 +48,7 @@ public final class ArrowFlightJdbcFlightStreamResultSet extends ArrowFlightJdbcVectorSchemaRootResultSet { private final ArrowFlightConnection connection; + private final FlightInfo flightInfo; private CloseableEndpointStreamPair currentEndpointData; private 
FlightEndpointDataQueue flightEndpointDataQueue; @@ -56,6 +57,9 @@ public final class ArrowFlightJdbcFlightStreamResultSet private Schema schema; + /** + * Public constructor used by ArrowFlightJdbcFactory. + */ ArrowFlightJdbcFlightStreamResultSet(final AvaticaStatement statement, final QueryState state, final Meta.Signature signature, @@ -64,20 +68,28 @@ public final class ArrowFlightJdbcFlightStreamResultSet final Meta.Frame firstFrame) throws SQLException { super(statement, state, signature, resultSetMetaData, timeZone, firstFrame); this.connection = (ArrowFlightConnection) statement.connection; + this.flightInfo = ((ArrowFlightInfoStatement) statement).executeFlightInfoQuery(); } - ArrowFlightJdbcFlightStreamResultSet(final ArrowFlightConnection connection, - final QueryState state, - final Meta.Signature signature, - final ResultSetMetaData resultSetMetaData, - final TimeZone timeZone, - final Meta.Frame firstFrame) throws SQLException { + /** + * Private constructor for fromFlightInfo. + */ + private ArrowFlightJdbcFlightStreamResultSet(final ArrowFlightConnection connection, + final QueryState state, + final Meta.Signature signature, + final ResultSetMetaData resultSetMetaData, + final TimeZone timeZone, + final Meta.Frame firstFrame, + final FlightInfo flightInfo + ) throws SQLException { super(null, state, signature, resultSetMetaData, timeZone, firstFrame); this.connection = connection; + this.flightInfo = flightInfo; } /** - * Create a {@link ResultSet} which pulls data from given {@link FlightInfo}. + * Create a {@link ResultSet} which pulls data from given {@link FlightInfo}. This is used to fetch result sets + * from DatabaseMetadata calls and skips the Avatica factory. * * @param connection The connection linked to the returned ResultSet. * @param flightInfo The FlightInfo from which data will be iterated by the returned ResultSet. @@ -99,11 +111,11 @@ static ArrowFlightJdbcFlightStreamResultSet fromFlightInfo( new AvaticaResultSetMetaData(null, null, signature); final ArrowFlightJdbcFlightStreamResultSet resultSet = new ArrowFlightJdbcFlightStreamResultSet(connection, state, signature, resultSetMetaData, - timeZone, null); + timeZone, null, flightInfo); resultSet.transformer = transformer; - resultSet.populateData(flightInfo); + resultSet.populateData(); return resultSet; } @@ -121,16 +133,14 @@ private void loadNewFlightStream() throws SQLException { @Override protected AvaticaResultSet execute() throws SQLException { - final FlightInfo flightInfo = ((ArrowFlightInfoStatement) statement).executeFlightInfoQuery(); - if (flightInfo != null) { schema = flightInfo.getSchemaOptional().orElse(null); - populateData(flightInfo); + populateData(); } return this; } - private void populateData(final FlightInfo flightInfo) throws SQLException { + private void populateData() throws SQLException { loadNewQueue(); flightEndpointDataQueue.enqueue(connection.getClientHandler().getStreams(flightInfo)); loadNewFlightStream(); @@ -157,6 +167,13 @@ private void populateDataForCurrentFlightStream() throws SQLException { populateData(currentVectorSchemaRoot, schema); } + /** + * Expose appMetadata associated with the underlying FlightInfo for this ResultSet. 
+ */ + public byte[] getAppMetadata() { + return flightInfo.getAppMetadata(); + } + @Override public boolean next() throws SQLException { if (currentVectorSchemaRoot == null) { diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java index 60231a2460286..1525bcaaf51b1 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/FixedSizeListAvaticaParameterConverter.java @@ -17,7 +17,11 @@ package org.apache.arrow.driver.jdbc.converter.impl; +import java.util.List; + +import org.apache.arrow.driver.jdbc.utils.AvaticaParameterBinder; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.calcite.avatica.AvaticaParameter; @@ -33,6 +37,41 @@ public FixedSizeListAvaticaParameterConverter(ArrowType.FixedSizeList type) { @Override public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + final List values = (List) typedValue.value; + final int arraySize = values.size(); + + if (vector instanceof FixedSizeListVector) { + FixedSizeListVector listVector = ((FixedSizeListVector) vector); + FieldVector childVector = listVector.getDataVector(); + int maxArraySize = listVector.getListSize(); + + if (arraySize != maxArraySize) { + if (!childVector.getField().isNullable()) { + throw new UnsupportedOperationException("Each array must contain " + maxArraySize + " elements"); + } else if (arraySize > maxArraySize) { + throw new UnsupportedOperationException("Each array must contain at most " + maxArraySize + " elements"); + } + } + + int startPos = listVector.startNewValue(index); + for (int i = 0; i < arraySize; i++) { + Object val = values.get(i); + int childIndex = startPos + i; + if (val == null) { + if (childVector.getField().isNullable()) { + childVector.setNull(childIndex); + } else { + throw new UnsupportedOperationException("Can't set null on non-nullable child list"); + } + } else { + childVector.getField().getType().accept( + new AvaticaParameterBinder.BinderVisitor( + childVector, TypedValue.ofSerial(typedValue.componentType, val), childIndex)); + } + } + listVector.setValueCount(index + 1); + return true; + } return false; } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java index 6ef6920474860..a20747693e35a 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/LargeListAvaticaParameterConverter.java @@ -17,7 +17,12 @@ package org.apache.arrow.driver.jdbc.converter.impl; +import java.util.List; + +import org.apache.arrow.driver.jdbc.utils.AvaticaParameterBinder; +import org.apache.arrow.memory.util.LargeMemoryUtil; import org.apache.arrow.vector.FieldVector; +import 
org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.calcite.avatica.AvaticaParameter; @@ -33,6 +38,32 @@ public LargeListAvaticaParameterConverter(ArrowType.LargeList type) { @Override public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + final List values = (List) typedValue.value; + + if (vector instanceof LargeListVector) { + LargeListVector listVector = ((LargeListVector) vector); + FieldVector childVector = listVector.getDataVector(); + + long startPos = listVector.startNewValue(index); + for (int i = 0; i < values.size(); i++) { + Object val = values.get(i); + int childIndex = LargeMemoryUtil.checkedCastToInt(startPos) + i; + if (val == null) { + if (childVector.getField().isNullable()) { + childVector.setNull(childIndex); + } else { + throw new UnsupportedOperationException("Can't set null on non-nullable child list"); + } + } else { + childVector.getField().getType().accept( + new AvaticaParameterBinder.BinderVisitor( + childVector, TypedValue.ofSerial(typedValue.componentType, val), childIndex)); + } + } + listVector.endValue(index, values.size()); + listVector.setValueCount(index + 1); + return true; + } return false; } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java index aec59cb4d428e..f6cb9f3be2a4c 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/ListAvaticaParameterConverter.java @@ -17,7 +17,11 @@ package org.apache.arrow.driver.jdbc.converter.impl; +import java.util.List; + +import org.apache.arrow.driver.jdbc.utils.AvaticaParameterBinder; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.calcite.avatica.AvaticaParameter; @@ -33,6 +37,32 @@ public ListAvaticaParameterConverter(ArrowType.List type) { @Override public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + final List values = (List) typedValue.value; + + if (vector instanceof ListVector) { + ListVector listVector = ((ListVector) vector); + FieldVector childVector = listVector.getDataVector(); + + int startPos = listVector.startNewValue(index); + for (int i = 0; i < values.size(); i++) { + Object val = values.get(i); + int childIndex = startPos + i; + if (val == null) { + if (childVector.getField().isNullable()) { + childVector.setNull(childIndex); + } else { + throw new UnsupportedOperationException("Can't set null on non-nullable child list"); + } + } else { + childVector.getField().getType().accept( + new AvaticaParameterBinder.BinderVisitor( + childVector, TypedValue.ofSerial(typedValue.componentType, val), childIndex)); + } + } + listVector.endValue(index, values.size()); + listVector.setValueCount(index + 1); + return true; + } return false; } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java 
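// --- Illustrative usage (not part of the patch): with the list converters above, a SQL ARRAY
// --- parameter bound through the JDBC driver is written element-by-element into the backing
// --- ListVector, nulls included. Mirrors the pattern in ArrowFlightPreparedStatementTest; the
// --- query text and parameter ordinal are placeholders.
import java.sql.Array;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

final class ListParameterBindingSketch {
  static void bindIntegerList(Connection connection) throws SQLException {
    try (PreparedStatement stmt = connection.prepareStatement("SELECT * FROM t WHERE ids = ?")) {
      // createArrayOf + setArray routes through ListAvaticaParameterConverter.bindParameter,
      // which delegates each element to AvaticaParameterBinder.BinderVisitor.
      Array ids = connection.createArrayOf("INTEGER", new Integer[] {1, 2, null});
      stmt.setArray(1, ids);
      try (ResultSet resultSet = stmt.executeQuery()) {
        while (resultSet.next()) {
          // consume rows
        }
      }
    }
  }
}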
b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index 9e805fc79bcf8..5fa3ba38f2506 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -119,12 +119,22 @@ private void bind(FieldVector vector, TypedValue typedValue, int index) { } } - private static class BinderVisitor implements ArrowType.ArrowTypeVisitor { + /** + * ArrowTypeVisitor that binds Avatica TypedValues to the given FieldVector at the specified index. + */ + public static class BinderVisitor implements ArrowType.ArrowTypeVisitor { private final FieldVector vector; private final TypedValue typedValue; private final int index; - private BinderVisitor(FieldVector vector, TypedValue value, int index) { + /** + * Instantiate a new BinderVisitor. + * + * @param vector FieldVector to bind values to. + * @param value TypedValue to bind. + * @param index Vector index (0-based) to bind the value to. + */ + public BinderVisitor(FieldVector vector, TypedValue value, int index) { this.vector = vector; this.typedValue = value; this.index = index; diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java index b19f049544ada..0b521a704bb6a 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ArrowFlightPreparedStatementTest.java @@ -27,6 +27,7 @@ import java.sql.SQLException; import java.util.Arrays; import java.util.Collections; +import java.util.List; import org.apache.arrow.driver.jdbc.utils.CoreMockedSqlProducers; import org.apache.arrow.driver.jdbc.utils.MockFlightSqlProducer; @@ -38,6 +39,7 @@ import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.Text; import org.junit.AfterClass; @@ -89,6 +91,14 @@ public void testSimpleQueryNoParameterBinding() throws SQLException { public void testQueryWithParameterBinding() throws SQLException { final String query = "Fake query with parameters"; final Schema schema = new Schema(Collections.singletonList(Field.nullable("", Types.MinorType.INT.getType()))); + final Schema parameterSchema = new Schema(Arrays.asList( + Field.nullable("", ArrowType.Utf8.INSTANCE), + new Field("", FieldType.nullable(ArrowType.List.INSTANCE), + Collections.singletonList(Field.nullable("", Types.MinorType.INT.getType()))))); + final List> expected = Collections.singletonList(Arrays.asList( + new Text("foo"), + new Integer[]{1, 2, null})); + PRODUCER.addSelectQuery(query, schema, Collections.singletonList(listener -> { try (final BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); @@ -105,11 +115,12 @@ public void testQueryWithParameterBinding() throws SQLException { } })); - PRODUCER.addExpectedParameters(query, - new Schema(Collections.singletonList(Field.nullable("", ArrowType.Utf8.INSTANCE))), - Collections.singletonList(Collections.singletonList(new Text("foo".getBytes(StandardCharsets.UTF_8))))); + 
PRODUCER.addExpectedParameters(query, parameterSchema, expected); + try (final PreparedStatement preparedStatement = connection.prepareStatement(query)) { preparedStatement.setString(1, "foo"); + preparedStatement.setArray(2, connection.createArrayOf("INTEGER", new Integer[]{1, 2, null})); + try (final ResultSet resultSet = preparedStatement.executeQuery()) { resultSet.next(); assert true; @@ -171,17 +182,29 @@ public void testUpdateQueryWithParameters() throws SQLException { @Test public void testUpdateQueryWithBatchedParameters() throws SQLException { String query = "Fake update with batched parameters"; - PRODUCER.addUpdateQuery(query, /*updatedRows*/42); - PRODUCER.addExpectedParameters(query, - new Schema(Collections.singletonList(Field.nullable("", ArrowType.Utf8.INSTANCE))), + Schema parameterSchema = new Schema(Arrays.asList( + Field.nullable("", ArrowType.Utf8.INSTANCE), + new Field("", FieldType.nullable(ArrowType.List.INSTANCE), + Collections.singletonList(Field.nullable("", Types.MinorType.INT.getType()))))); + List> expected = Arrays.asList( + Arrays.asList( + new Text("foo"), + new Integer[]{1, 2, null}), Arrays.asList( - Collections.singletonList(new Text("foo".getBytes(StandardCharsets.UTF_8))), - Collections.singletonList(new Text("bar".getBytes(StandardCharsets.UTF_8))))); + new Text("bar"), + new Integer[]{0, -1, 100000}) + ); + + PRODUCER.addUpdateQuery(query, /*updatedRows*/42); + PRODUCER.addExpectedParameters(query, parameterSchema, expected); + try (final PreparedStatement stmt = connection.prepareStatement(query)) { // TODO: make sure this is validated on the server too stmt.setString(1, "foo"); + stmt.setArray(2, connection.createArrayOf("INTEGER", new Integer[]{1, 2, null})); stmt.addBatch(); stmt.setString(1, "bar"); + stmt.setArray(2, connection.createArrayOf("INTEGER", new Integer[]{0, -1, 100000})); stmt.addBatch(); int[] updated = stmt.executeBatch(); assertEquals(42, updated[0]); diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java index 52910812fb4fb..231371a923a28 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/ResultSetTest.java @@ -25,10 +25,12 @@ import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.nio.charset.StandardCharsets; import java.sql.Connection; import java.sql.DriverManager; import java.sql.ResultSet; @@ -469,4 +471,14 @@ public void testShouldRunSelectQueryWithEmptyVectorsEmbedded() throws Exception assertEquals(2, rowCount); } } + + @Test + public void testResultSetAppMetadata() throws Exception { + try (Statement statement = connection.createStatement(); + ResultSet resultSet = statement.executeQuery( + CoreMockedSqlProducers.LEGACY_REGULAR_SQL_CMD)) { + assertArrayEquals(((ArrowFlightJdbcFlightStreamResultSet) resultSet).getAppMetadata(), + "foo".getBytes(StandardCharsets.UTF_8)); + } + } } diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java 
b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java index 2b65f8f5a07ba..f36956f193ce8 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/utils/MockFlightSqlProducer.java @@ -29,6 +29,7 @@ import java.nio.channels.Channels; import java.nio.charset.StandardCharsets; import java.util.AbstractMap.SimpleImmutableEntry; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -80,6 +81,7 @@ import org.apache.arrow.vector.ipc.WriteChannel; import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.JsonStringArrayList; import org.apache.calcite.avatica.Meta.StatementType; import com.google.protobuf.Any; @@ -270,7 +272,9 @@ public FlightInfo getFlightInfoStatement(final CommandStatementQuery commandStat .map(TicketConversionUtils::getTicketStatementQueryFromHandle) .map(TicketConversionUtils::getEndpointFromMessage) .collect(toList()); - return new FlightInfo(queryInfo.getKey(), flightDescriptor, endpoints, -1, -1); + return FlightInfo.builder(queryInfo.getKey(), flightDescriptor, endpoints) + .setAppMetadata("foo".getBytes(StandardCharsets.UTF_8)) + .build(); } @Override @@ -293,7 +297,9 @@ public FlightInfo getFlightInfoPreparedStatement( .map(TicketConversionUtils::getCommandPreparedStatementQueryFromHandle) .map(TicketConversionUtils::getEndpointFromMessage) .collect(toList()); - return new FlightInfo(queryInfo.getKey(), flightDescriptor, endpoints, -1, -1); + return FlightInfo.builder(queryInfo.getKey(), flightDescriptor, endpoints) + .setAppMetadata("foo".getBytes(StandardCharsets.UTF_8)) + .build(); } @Override @@ -373,7 +379,13 @@ private boolean validateParameters(String query, for (int paramIndex = 0; paramIndex < expectedRow.size(); paramIndex++) { Object expected = expectedRow.get(paramIndex); Object actual = root.getVector(paramIndex).getObject(i); - if (!Objects.equals(expected, actual)) { + boolean matches; + if (expected.getClass().isArray()) { + matches = Arrays.equals((Object[]) expected, ((JsonStringArrayList) actual).toArray()); + } else { + matches = Objects.equals(expected, actual); + } + if (!matches) { streamListener.onError(CallStatus.INVALID_ARGUMENT .withDescription(String.format("Parameter mismatch. 
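// --- Illustrative usage (not part of the patch): how a producer can attach app_metadata to the
// --- FlightInfo it returns, as the mock producer above now does via FlightInfo.builder instead
// --- of the old constructor. The schema, descriptor, and ticket bytes are placeholders, and the
// --- single in-process endpoint is only for illustration.
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;

import org.apache.arrow.flight.FlightDescriptor;
import org.apache.arrow.flight.FlightEndpoint;
import org.apache.arrow.flight.FlightInfo;
import org.apache.arrow.flight.Ticket;
import org.apache.arrow.vector.types.pojo.Schema;

final class AppMetadataFlightInfoSketch {
  static FlightInfo withAppMetadata(Schema schema, FlightDescriptor descriptor) {
    List<FlightEndpoint> endpoints = Collections.singletonList(
        new FlightEndpoint(new Ticket("query-handle".getBytes(StandardCharsets.UTF_8))));
    // The builder replaces the old FlightInfo constructor that required explicit record/byte
    // counts; setAppMetadata carries opaque bytes the JDBC client can read via getAppMetadata().
    return FlightInfo.builder(schema, descriptor, endpoints)
        .setAppMetadata("foo".getBytes(StandardCharsets.UTF_8))
        .build();
  }
}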
Expected: %s Actual: %s", expected, actual)) .toRuntimeException()); diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index 263538ba48b37..84462f54950ba 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -107,7 +107,7 @@ joda-time joda-time - 2.10.14 + 2.12.5 runtime @@ -159,7 +159,6 @@ org.apache.maven.plugins maven-shade-plugin - 3.4.1 package diff --git a/java/flight/flight-sql/pom.xml b/java/flight/flight-sql/pom.xml index 09100d9731ad5..25478e58d2986 100644 --- a/java/flight/flight-sql/pom.xml +++ b/java/flight/flight-sql/pom.xml @@ -53,6 +53,7 @@ org.apache.arrow arrow-jdbc + test com.google.guava @@ -70,6 +71,7 @@ org.slf4j slf4j-api + test org.apache.derby diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Binary.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Binary.java index f2ea525027871..08674cbccae98 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Binary.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Binary.java @@ -18,17 +18,29 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Opaque binary data */ +@SuppressWarnings("unused") public final class Binary extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Binary getRootAsBinary(ByteBuffer _bb) { return getRootAsBinary(_bb, new Binary()); } public static Binary getRootAsBinary(ByteBuffer _bb, Binary obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/BinaryView.java b/java/format/src/main/java/org/apache/arrow/flatbuf/BinaryView.java index 56a8d329532c0..6b58f1c6738f1 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/BinaryView.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/BinaryView.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; 
+import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Logically the same as Binary, but the internal representation uses a view * struct that contains the string length and either the string's entire data @@ -33,8 +44,9 @@ * Since it uses a variable number of data buffers, each Field with this type * must have a corresponding entry in `variadicBufferCounts`. */ +@SuppressWarnings("unused") public final class BinaryView extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static BinaryView getRootAsBinaryView(ByteBuffer _bb) { return getRootAsBinaryView(_bb, new BinaryView()); } public static BinaryView getRootAsBinaryView(ByteBuffer _bb, BinaryView obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Block.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Block.java index e1435f83250d6..c56d52463af85 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Block.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Block.java @@ -18,10 +18,22 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Block extends Struct { diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompression.java b/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompression.java index ed8ce0939a044..b33006081154a 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompression.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompression.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Optional compression for the memory buffers constituting IPC 
message * bodies. Intended for use with RecordBatch but could be used for other * message types */ +@SuppressWarnings("unused") public final class BodyCompression extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static BodyCompression getRootAsBodyCompression(ByteBuffer _bb) { return getRootAsBodyCompression(_bb, new BodyCompression()); } public static BodyCompression getRootAsBodyCompression(ByteBuffer _bb, BodyCompression obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompressionMethod.java b/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompressionMethod.java index 48cff16e7511e..eb63b073d7027 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompressionMethod.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/BodyCompressionMethod.java @@ -23,6 +23,7 @@ * strategies for compressing the IPC message body (like whole-body * compression rather than buffer-level) in the future */ +@SuppressWarnings("unused") public final class BodyCompressionMethod { private BodyCompressionMethod() { } /** diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Bool.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Bool.java index e6b54e4b7bfa8..1f0ec75d6925f 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Bool.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Bool.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Bool extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Bool getRootAsBool(ByteBuffer _bb) { return getRootAsBool(_bb, new Bool()); } public static Bool getRootAsBool(ByteBuffer _bb, Bool obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Buffer.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Buffer.java index 589ed0b711ea0..c92c0c1751313 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Buffer.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Buffer.java @@ -18,16 +18,28 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import 
com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * A Buffer represents a single contiguous memory segment */ +@SuppressWarnings("unused") public final class Buffer extends Struct { public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } public Buffer __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/CompressionType.java b/java/format/src/main/java/org/apache/arrow/flatbuf/CompressionType.java index 0597ffd30abb8..c9850a2633f97 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/CompressionType.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/CompressionType.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class CompressionType { private CompressionType() { } public static final byte LZ4_FRAME = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Date.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Date.java index ac6e389835a43..d52b0b18864ba 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Date.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Date.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Date is either a 32-bit or 64-bit signed integer type representing an * elapsed time since UNIX epoch (1970-01-01), stored in either of two units: @@ -32,8 +43,9 @@ * leap seconds), where the values are evenly divisible by 86400000 * * Days (32 bits) since the UNIX epoch */ +@SuppressWarnings("unused") public final class Date extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Date getRootAsDate(ByteBuffer _bb) { return getRootAsDate(_bb, new Date()); } public static Date getRootAsDate(ByteBuffer _bb, Date obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { 
__reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/DateUnit.java b/java/format/src/main/java/org/apache/arrow/flatbuf/DateUnit.java index f2c96f45b2ec0..75b8b4cf7a32e 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/DateUnit.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/DateUnit.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class DateUnit { private DateUnit() { } public static final short DAY = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Decimal.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Decimal.java index 8ffaa1ebb7326..87c7c7bc05d3a 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Decimal.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Decimal.java @@ -18,20 +18,32 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Exact decimal value represented as an integer value in two's * complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers * are used. The representation uses the endianness indicated * in the Schema. 
*/ +@SuppressWarnings("unused") public final class Decimal extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Decimal getRootAsDecimal(ByteBuffer _bb) { return getRootAsDecimal(_bb, new Decimal()); } public static Decimal getRootAsDecimal(ByteBuffer _bb, Decimal obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryBatch.java b/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryBatch.java index fe6c59fb51e6d..f0e894238e9f4 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryBatch.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryBatch.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * For sending dictionary encoding information. 
Any Field can be * dictionary-encoded, but in this case none of its children may be @@ -32,8 +43,9 @@ * may be spread across multiple dictionary batches by using the isDelta * flag */ +@SuppressWarnings("unused") public final class DictionaryBatch extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static DictionaryBatch getRootAsDictionaryBatch(ByteBuffer _bb) { return getRootAsDictionaryBatch(_bb, new DictionaryBatch()); } public static DictionaryBatch getRootAsDictionaryBatch(ByteBuffer _bb, DictionaryBatch obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryEncoding.java b/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryEncoding.java index 8b2bb73e794de..c314ebb124e25 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryEncoding.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryEncoding.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class DictionaryEncoding extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static DictionaryEncoding getRootAsDictionaryEncoding(ByteBuffer _bb) { return getRootAsDictionaryEncoding(_bb, new DictionaryEncoding()); } public static DictionaryEncoding getRootAsDictionaryEncoding(ByteBuffer _bb, DictionaryEncoding obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryKind.java b/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryKind.java index ecefa4b765508..ed9337a560279 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryKind.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/DictionaryKind.java @@ -25,6 +25,7 @@ * Dictionaries might be explicit maps between integers and values * allowing for non-contiguous index values */ +@SuppressWarnings("unused") public final class DictionaryKind { private DictionaryKind() { } public static final short DenseArray = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Duration.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Duration.java index e1495f3002dd8..1518bd599da95 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Duration.java +++ 
b/java/format/src/main/java/org/apache/arrow/flatbuf/Duration.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Duration extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Duration getRootAsDuration(ByteBuffer _bb) { return getRootAsDuration(_bb, new Duration()); } public static Duration getRootAsDuration(ByteBuffer _bb, Duration obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Endianness.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Endianness.java index 494a3dcf57f21..266cb73d505c9 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Endianness.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Endianness.java @@ -22,6 +22,7 @@ * ---------------------------------------------------------------------- * Endianness of the platform producing the data */ +@SuppressWarnings("unused") public final class Endianness { private Endianness() { } public static final short Little = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Feature.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Feature.java index a4fa84c372854..c0428c0bf44a3 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Feature.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Feature.java @@ -37,26 +37,23 @@ * to facilitate exchanging and comparing bitmaps for supported * features. */ +@SuppressWarnings("unused") public final class Feature { private Feature() { } /** * Needed to make flatbuffers happy. */ - public static final long UNUSED = 0; + public static final long UNUSED = 0L; /** * The stream makes use of multiple full dictionaries with the * same ID and assumes clients implement dictionary replacement * correctly. */ - public static final long DICTIONARY_REPLACEMENT = 1; + public static final long DICTIONARY_REPLACEMENT = 1L; /** * The stream makes use of compressed bodies as described * in Message.fbs. 
*/ - public static final long COMPRESSED_BODY = 2; - - public static final String[] names = { "UNUSED", "DICTIONARY_REPLACEMENT", "COMPRESSED_BODY", }; - - public static String name(int e) { return names[e]; } + public static final long COMPRESSED_BODY = 2L; } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Field.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Field.java index d34501e0ac2a4..9724070bc00fd 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Field.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Field.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * A field represents a named column in a record / row batch or child of a * nested type. */ +@SuppressWarnings("unused") public final class Field extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Field getRootAsField(ByteBuffer _bb) { return getRootAsField(_bb, new Field()); } public static Field getRootAsField(ByteBuffer _bb, Field obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } @@ -77,18 +89,18 @@ public final class Field extends Table { public static int createField(FlatBufferBuilder builder, int nameOffset, boolean nullable, - byte type_type, + byte typeType, int typeOffset, int dictionaryOffset, int childrenOffset, - int custom_metadataOffset) { + int customMetadataOffset) { builder.startTable(7); - Field.addCustomMetadata(builder, custom_metadataOffset); + Field.addCustomMetadata(builder, customMetadataOffset); Field.addChildren(builder, childrenOffset); Field.addDictionary(builder, dictionaryOffset); Field.addType(builder, typeOffset); Field.addName(builder, nameOffset); - Field.addTypeType(builder, type_type); + Field.addTypeType(builder, typeType); Field.addNullable(builder, nullable); return Field.endField(builder); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/FieldNode.java b/java/format/src/main/java/org/apache/arrow/flatbuf/FieldNode.java index 3ea9805f6bc2d..16594fb7d9b87 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/FieldNode.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/FieldNode.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import 
com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * Data structures for describing a table row batch (a collection of @@ -35,6 +46,7 @@ * would have {length: 5, null_count: 2} for its List node, and {length: 6, * null_count: 0} for its Int16 node, as separate FieldNode structs */ +@SuppressWarnings("unused") public final class FieldNode extends Struct { public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } public FieldNode __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeBinary.java b/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeBinary.java index 287b34e22580b..8e090a30e1a6c 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeBinary.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeBinary.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class FixedSizeBinary extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static FixedSizeBinary getRootAsFixedSizeBinary(ByteBuffer _bb) { return getRootAsFixedSizeBinary(_bb, new FixedSizeBinary()); } public static FixedSizeBinary getRootAsFixedSizeBinary(ByteBuffer _bb, FixedSizeBinary obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeList.java b/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeList.java index d0d88923871f2..ce96771819186 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeList.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/FixedSizeList.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import 
com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class FixedSizeList extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static FixedSizeList getRootAsFixedSizeList(ByteBuffer _bb) { return getRootAsFixedSizeList(_bb, new FixedSizeList()); } public static FixedSizeList getRootAsFixedSizeList(ByteBuffer _bb, FixedSizeList obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/FloatingPoint.java b/java/format/src/main/java/org/apache/arrow/flatbuf/FloatingPoint.java index 945fa627d4ddb..e602ebedd35df 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/FloatingPoint.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/FloatingPoint.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class FloatingPoint extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static FloatingPoint getRootAsFloatingPoint(ByteBuffer _bb) { return getRootAsFloatingPoint(_bb, new FloatingPoint()); } public static FloatingPoint getRootAsFloatingPoint(ByteBuffer _bb, FloatingPoint obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Footer.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Footer.java index 86fd75e03bdfe..9cc7d67bcb9d2 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Footer.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Footer.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import 
com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * Arrow File metadata * */ +@SuppressWarnings("unused") public final class Footer extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Footer getRootAsFooter(ByteBuffer _bb) { return getRootAsFooter(_bb, new Footer()); } public static Footer getRootAsFooter(ByteBuffer _bb, Footer obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } @@ -63,9 +75,9 @@ public static int createFooter(FlatBufferBuilder builder, int schemaOffset, int dictionariesOffset, int recordBatchesOffset, - int custom_metadataOffset) { + int customMetadataOffset) { builder.startTable(5); - Footer.addCustomMetadata(builder, custom_metadataOffset); + Footer.addCustomMetadata(builder, customMetadataOffset); Footer.addRecordBatches(builder, recordBatchesOffset); Footer.addDictionaries(builder, dictionariesOffset); Footer.addSchema(builder, schemaOffset); diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Int.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Int.java index 94cb96a05f3f3..8bb2866c38480 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Int.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Int.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Int extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Int getRootAsInt(ByteBuffer _bb) { return getRootAsInt(_bb, new Int()); } public static Int getRootAsInt(ByteBuffer _bb, Int obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } @@ -36,10 +48,10 @@ public final class Int extends Table { public static int createInt(FlatBufferBuilder builder, int bitWidth, - boolean is_signed) { + boolean isSigned) { 
builder.startTable(2); Int.addBitWidth(builder, bitWidth); - Int.addIsSigned(builder, is_signed); + Int.addIsSigned(builder, isSigned); return Int.endInt(builder); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Interval.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Interval.java index e690b0badde59..c2cd1e5d6acc0 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Interval.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Interval.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Interval extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Interval getRootAsInterval(ByteBuffer _bb) { return getRootAsInterval(_bb, new Interval()); } public static Interval getRootAsInterval(ByteBuffer _bb, Interval obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/IntervalUnit.java b/java/format/src/main/java/org/apache/arrow/flatbuf/IntervalUnit.java index 2b1e8248a713a..8a8332468b7bb 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/IntervalUnit.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/IntervalUnit.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class IntervalUnit { private IntervalUnit() { } public static final short YEAR_MONTH = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/KeyValue.java b/java/format/src/main/java/org/apache/arrow/flatbuf/KeyValue.java index 0c6e9f66ea82a..dcbd0744fd0d1 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/KeyValue.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/KeyValue.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import 
java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * user defined key value pairs to add custom metadata to arrow * key namespacing is the responsibility of the user */ +@SuppressWarnings("unused") public final class KeyValue extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static KeyValue getRootAsKeyValue(ByteBuffer _bb) { return getRootAsKeyValue(_bb, new KeyValue()); } public static KeyValue getRootAsKeyValue(ByteBuffer _bb, KeyValue obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeBinary.java b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeBinary.java index b7377bbe947a4..65d9314858885 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeBinary.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeBinary.java @@ -18,18 +18,30 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Same as Binary, but with 64-bit offsets, allowing to represent * extremely large data values. 
*/ +@SuppressWarnings("unused") public final class LargeBinary extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static LargeBinary getRootAsLargeBinary(ByteBuffer _bb) { return getRootAsLargeBinary(_bb, new LargeBinary()); } public static LargeBinary getRootAsLargeBinary(ByteBuffer _bb, LargeBinary obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeList.java b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeList.java index 32cc0034c4650..144454d5bfaf5 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeList.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeList.java @@ -18,18 +18,30 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Same as List, but with 64-bit offsets, allowing to represent * extremely large data values. 
*/ +@SuppressWarnings("unused") public final class LargeList extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static LargeList getRootAsLargeList(ByteBuffer _bb) { return getRootAsLargeList(_bb, new LargeList()); } public static LargeList getRootAsLargeList(ByteBuffer _bb, LargeList obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeListView.java b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeListView.java index 08c31c23a943f..c88ff0bdc951a 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeListView.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeListView.java @@ -18,18 +18,30 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Same as ListView, but with 64-bit offsets and sizes, allowing to represent * extremely large data values. 
*/ +@SuppressWarnings("unused") public final class LargeListView extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static LargeListView getRootAsLargeListView(ByteBuffer _bb) { return getRootAsLargeListView(_bb, new LargeListView()); } public static LargeListView getRootAsLargeListView(ByteBuffer _bb, LargeListView obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeUtf8.java b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeUtf8.java index 7e7a20117deee..4c3532250dd9d 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/LargeUtf8.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/LargeUtf8.java @@ -18,18 +18,30 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Same as Utf8, but with 64-bit offsets, allowing to represent * extremely large data values. 
*/ +@SuppressWarnings("unused") public final class LargeUtf8 extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static LargeUtf8 getRootAsLargeUtf8(ByteBuffer _bb) { return getRootAsLargeUtf8(_bb, new LargeUtf8()); } public static LargeUtf8 getRootAsLargeUtf8(ByteBuffer _bb, LargeUtf8 obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/List.java b/java/format/src/main/java/org/apache/arrow/flatbuf/List.java index 4493f9c5b3ebc..00b56652da884 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/List.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/List.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class List extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static List getRootAsList(ByteBuffer _bb) { return getRootAsList(_bb, new List()); } public static List getRootAsList(ByteBuffer _bb, List obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/ListView.java b/java/format/src/main/java/org/apache/arrow/flatbuf/ListView.java index 2c9ad4c13d884..e211600c0eefc 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/ListView.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/ListView.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Represents the same logical types that List can, but contains offsets and * sizes allowing for writes in 
any order and sharing of child values among * list values. */ +@SuppressWarnings("unused") public final class ListView extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static ListView getRootAsListView(ByteBuffer _bb) { return getRootAsListView(_bb, new ListView()); } public static ListView getRootAsListView(ByteBuffer _bb, ListView obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Map.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Map.java index 704426e92d4fb..b8a36a8b19f79 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Map.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Map.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * A Map is a logical nested type that is represented as * @@ -51,8 +62,9 @@ * for Map can make Map an alias for List. The "layout" attribute for the Map * field must have the same contents as a List. 
*/ +@SuppressWarnings("unused") public final class Map extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Map getRootAsMap(ByteBuffer _bb) { return getRootAsMap(_bb, new Map()); } public static Map getRootAsMap(ByteBuffer _bb, Map obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Message.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Message.java index c7738ad95a2b6..32941ad7b3d77 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Message.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Message.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Message extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Message getRootAsMessage(ByteBuffer _bb) { return getRootAsMessage(_bb, new Message()); } public static Message getRootAsMessage(ByteBuffer _bb, Message obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } @@ -43,16 +55,16 @@ public final class Message extends Table { public static int createMessage(FlatBufferBuilder builder, short version, - byte header_type, + byte headerType, int headerOffset, long bodyLength, - int custom_metadataOffset) { + int customMetadataOffset) { builder.startTable(5); Message.addBodyLength(builder, bodyLength); - Message.addCustomMetadata(builder, custom_metadataOffset); + Message.addCustomMetadata(builder, customMetadataOffset); Message.addHeader(builder, headerOffset); Message.addVersion(builder, version); - Message.addHeaderType(builder, header_type); + Message.addHeaderType(builder, headerType); return Message.endMessage(builder); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/MessageHeader.java b/java/format/src/main/java/org/apache/arrow/flatbuf/MessageHeader.java index 179b6ba0f5437..cc23ce3728cb4 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/MessageHeader.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/MessageHeader.java @@ -28,6 +28,7 @@ * which may include experimental metadata types. 
For maximum compatibility, * it is best to send data using RecordBatch */ +@SuppressWarnings("unused") public final class MessageHeader { private MessageHeader() { } public static final byte NONE = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/MetadataVersion.java b/java/format/src/main/java/org/apache/arrow/flatbuf/MetadataVersion.java index 8ce9d84fc2b2d..83b44b50668c6 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/MetadataVersion.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/MetadataVersion.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class MetadataVersion { private MetadataVersion() { } /** @@ -37,7 +38,7 @@ private MetadataVersion() { } */ public static final short V4 = 3; /** - * >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + * >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 * metadata and IPC messages). Implementations are recommended to provide a * V4 compatibility mode with V5 format changes disabled. * diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Null.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Null.java index b7a30f2e82216..e0cbf21b723c5 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Null.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Null.java @@ -18,17 +18,29 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * These are stored in the flatbuffer in the Type union below */ +@SuppressWarnings("unused") public final class Null extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Null getRootAsNull(ByteBuffer _bb) { return getRootAsNull(_bb, new Null()); } public static Null getRootAsNull(ByteBuffer _bb, Null obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Precision.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Precision.java index e2c42237a6730..ff36882576742 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Precision.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Precision.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class Precision { private Precision() { } public static final short HALF = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/RecordBatch.java b/java/format/src/main/java/org/apache/arrow/flatbuf/RecordBatch.java index ce907ee0fdcda..286ef0840a613 
100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/RecordBatch.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/RecordBatch.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * A data header describing the shared memory layout of a "record" or "row" * batch. Some systems call this a "row batch" internally and others a "record * batch". */ +@SuppressWarnings("unused") public final class RecordBatch extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static RecordBatch getRootAsRecordBatch(ByteBuffer _bb) { return getRootAsRecordBatch(_bb, new RecordBatch()); } public static RecordBatch getRootAsRecordBatch(ByteBuffer _bb, RecordBatch obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/RunEndEncoded.java b/java/format/src/main/java/org/apache/arrow/flatbuf/RunEndEncoded.java index d48733ef0c826..89e8f0039a5ca 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/RunEndEncoded.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/RunEndEncoded.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Contains two child arrays, run_ends and values. * The run_ends child array must be a 16/32/64-bit integer array @@ -31,8 +42,9 @@ * each corresponding index in the values child array ends. * Like list/struct types, the value array can be of any type. 
*/ +@SuppressWarnings("unused") public final class RunEndEncoded extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static RunEndEncoded getRootAsRunEndEncoded(ByteBuffer _bb) { return getRootAsRunEndEncoded(_bb, new RunEndEncoded()); } public static RunEndEncoded getRootAsRunEndEncoded(ByteBuffer _bb, RunEndEncoded obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Schema.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Schema.java index 69c025254b2fd..a81c052ec0907 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Schema.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Schema.java @@ -18,18 +18,30 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * A Schema describes the columns in a row batch */ +@SuppressWarnings("unused") public final class Schema extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Schema getRootAsSchema(ByteBuffer _bb) { return getRootAsSchema(_bb, new Schema()); } public static Schema getRootAsSchema(ByteBuffer _bb, Schema obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } @@ -64,11 +76,11 @@ public final class Schema extends Table { public static int createSchema(FlatBufferBuilder builder, short endianness, int fieldsOffset, - int custom_metadataOffset, + int customMetadataOffset, int featuresOffset) { builder.startTable(4); Schema.addFeatures(builder, featuresOffset); - Schema.addCustomMetadata(builder, custom_metadataOffset); + Schema.addCustomMetadata(builder, customMetadataOffset); Schema.addFields(builder, fieldsOffset); Schema.addEndianness(builder, endianness); return Schema.endSchema(builder); diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixCompressedAxis.java b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixCompressedAxis.java index 2ad314f2e85ab..ddfc63a11867d 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixCompressedAxis.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixCompressedAxis.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class SparseMatrixCompressedAxis { 
private SparseMatrixCompressedAxis() { } public static final short Row = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixIndexCSX.java b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixIndexCSX.java index 9516a6ec146ac..9874899141172 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixIndexCSX.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseMatrixIndexCSX.java @@ -18,17 +18,29 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Compressed Sparse format, that is matrix-specific. */ +@SuppressWarnings("unused") public final class SparseMatrixIndexCSX extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static SparseMatrixIndexCSX getRootAsSparseMatrixIndexCSX(ByteBuffer _bb) { return getRootAsSparseMatrixIndexCSX(_bb, new SparseMatrixIndexCSX()); } public static SparseMatrixIndexCSX getRootAsSparseMatrixIndexCSX(ByteBuffer _bb, SparseMatrixIndexCSX obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensor.java b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensor.java index 9b4cdf6e8917a..62b4274146c08 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensor.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensor.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class SparseTensor extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static SparseTensor getRootAsSparseTensor(ByteBuffer _bb) { return getRootAsSparseTensor(_bb, new SparseTensor()); } public 
static SparseTensor getRootAsSparseTensor(ByteBuffer _bb, SparseTensor obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndex.java b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndex.java index 5b9444abcf004..f2a0455312b51 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndex.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndex.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class SparseTensorIndex { private SparseTensorIndex() { } public static final byte NONE = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCOO.java b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCOO.java index a84238d662d1e..80c524c75c7e3 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCOO.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCOO.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * ---------------------------------------------------------------------- * EXPERIMENTAL: Data structures for sparse tensors @@ -58,8 +69,9 @@ * (row-major order), and it does not have duplicated entries. Otherwise, * the indices may not be sorted, or may have duplicated entries. 
*/ +@SuppressWarnings("unused") public final class SparseTensorIndexCOO extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static SparseTensorIndexCOO getRootAsSparseTensorIndexCOO(ByteBuffer _bb) { return getRootAsSparseTensorIndexCOO(_bb, new SparseTensorIndexCOO()); } public static SparseTensorIndexCOO getRootAsSparseTensorIndexCOO(ByteBuffer _bb, SparseTensorIndexCOO obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCSF.java b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCSF.java index abc4662be1857..87f269de551f5 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCSF.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/SparseTensorIndexCSF.java @@ -18,17 +18,29 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Compressed Sparse Fiber (CSF) sparse tensor index. 
*/ +@SuppressWarnings("unused") public final class SparseTensorIndexCSF extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static SparseTensorIndexCSF getRootAsSparseTensorIndexCSF(ByteBuffer _bb) { return getRootAsSparseTensorIndexCSF(_bb, new SparseTensorIndexCSF()); } public static SparseTensorIndexCSF getRootAsSparseTensorIndexCSF(ByteBuffer _bb, SparseTensorIndexCSF obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Struct_.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Struct_.java index 1285f28843006..0517be6153c29 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Struct_.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Struct_.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * A Struct_ in the flatbuffer metadata is the same as an Arrow Struct * (according to the physical memory layout). 
We used Struct_ here as * Struct is a reserved word in Flatbuffers */ +@SuppressWarnings("unused") public final class Struct_ extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Struct_ getRootAsStruct_(ByteBuffer _bb) { return getRootAsStruct_(_bb, new Struct_()); } public static Struct_ getRootAsStruct_(ByteBuffer _bb, Struct_ obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Tensor.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Tensor.java index d4466bcf2f56a..5892b6aca90f1 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Tensor.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Tensor.java @@ -18,14 +18,26 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; @SuppressWarnings("unused") public final class Tensor extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Tensor getRootAsTensor(ByteBuffer _bb) { return getRootAsTensor(_bb, new Tensor()); } public static Tensor getRootAsTensor(ByteBuffer _bb, Tensor obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/TensorDim.java b/java/format/src/main/java/org/apache/arrow/flatbuf/TensorDim.java index fad8caacd2e70..3c85786f409e8 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/TensorDim.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/TensorDim.java @@ -18,19 +18,31 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * 
---------------------------------------------------------------------- * Data structures for dense tensors * Shape data for a single axis in a tensor */ +@SuppressWarnings("unused") public final class TensorDim extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static TensorDim getRootAsTensorDim(ByteBuffer _bb) { return getRootAsTensorDim(_bb, new TensorDim()); } public static TensorDim getRootAsTensorDim(ByteBuffer _bb, TensorDim obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Time.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Time.java index 9acc3fc7a5ea1..d5a320f493596 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Time.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Time.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Time is either a 32-bit or 64-bit signed integer type representing an * elapsed time since midnight, stored in either of four units: seconds, @@ -40,8 +51,9 @@ * measurements with leap seconds will need to be corrected when ingesting * into Arrow (for example by replacing the value 86400 with 86399). 
*/ +@SuppressWarnings("unused") public final class Time extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Time getRootAsTime(ByteBuffer _bb) { return getRootAsTime(_bb, new Time()); } public static Time getRootAsTime(ByteBuffer _bb, Time obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/TimeUnit.java b/java/format/src/main/java/org/apache/arrow/flatbuf/TimeUnit.java index 828e44c13f900..2a7bfad5c614d 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/TimeUnit.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/TimeUnit.java @@ -18,6 +18,7 @@ package org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class TimeUnit { private TimeUnit() { } public static final short SECOND = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Timestamp.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Timestamp.java index fe0c6aaea24fa..d906fbeea1b9b 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Timestamp.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Timestamp.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Timestamp is a 64-bit signed integer representing an elapsed time since a * fixed epoch, stored in either of four units: seconds, milliseconds, @@ -124,15 +135,16 @@ * no indication of how to map this information to a physical point in time. * Naive date-times must be handled with care because of this missing * information, and also because daylight saving time (DST) may make - * some values ambiguous or non-existent. A naive date-time may be + * some values ambiguous or nonexistent. A naive date-time may be * stored as a struct with Date and Time fields. However, it may also be * encoded into a Timestamp column with an empty timezone. The timestamp * values should be computed "as if" the timezone of the date-time values * was UTC; for example, the naive date-time "January 1st 1970, 00h00" would * be encoded as timestamp value 0. 
*/ +@SuppressWarnings("unused") public final class Timestamp extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Timestamp getRootAsTimestamp(ByteBuffer _bb) { return getRootAsTimestamp(_bb, new Timestamp()); } public static Timestamp getRootAsTimestamp(ByteBuffer _bb, Timestamp obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Type.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Type.java index 29248bb23c303..07bb34b11ca12 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Type.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Type.java @@ -23,6 +23,7 @@ * Top-level Type value, enabling extensible type-specific metadata. We can * add new logical types to Type without breaking backwards compatibility */ +@SuppressWarnings("unused") public final class Type { private Type() { } public static final byte NONE = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Union.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Union.java index 7e282243425cd..1a106919cb137 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Union.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Union.java @@ -18,20 +18,32 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * A union is a complex type with children in Field * By default ids in the type vector refer to the offsets in the children * optionally typeIds provides an indirection between the child offset and the type id * for each child `typeIds[offset]` is the id used in the type vector */ +@SuppressWarnings("unused") public final class Union extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Union getRootAsUnion(ByteBuffer _bb) { return getRootAsUnion(_bb, new Union()); } public static Union getRootAsUnion(ByteBuffer _bb, Union obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/UnionMode.java b/java/format/src/main/java/org/apache/arrow/flatbuf/UnionMode.java index 23a6013f8e4f4..d66ddc318e81f 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/UnionMode.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/UnionMode.java @@ -18,6 +18,7 @@ package 
org.apache.arrow.flatbuf; +@SuppressWarnings("unused") public final class UnionMode { private UnionMode() { } public static final short Sparse = 0; diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8.java index d77fe205f422e..60933cf25aff5 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8.java @@ -18,17 +18,29 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Unicode with UTF-8 encoding */ +@SuppressWarnings("unused") public final class Utf8 extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Utf8 getRootAsUtf8(ByteBuffer _bb) { return getRootAsUtf8(_bb, new Utf8()); } public static Utf8 getRootAsUtf8(ByteBuffer _bb, Utf8 obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8View.java b/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8View.java index 035c977576e43..377ba034cbb7f 100644 --- a/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8View.java +++ b/java/format/src/main/java/org/apache/arrow/flatbuf/Utf8View.java @@ -18,12 +18,23 @@ package org.apache.arrow.flatbuf; -import java.nio.*; -import java.lang.*; -import java.util.*; -import com.google.flatbuffers.*; +import com.google.flatbuffers.BaseVector; +import com.google.flatbuffers.BooleanVector; +import com.google.flatbuffers.ByteVector; +import com.google.flatbuffers.Constants; +import com.google.flatbuffers.DoubleVector; +import com.google.flatbuffers.FlatBufferBuilder; +import com.google.flatbuffers.FloatVector; +import com.google.flatbuffers.IntVector; +import com.google.flatbuffers.LongVector; +import com.google.flatbuffers.ShortVector; +import com.google.flatbuffers.StringVector; +import com.google.flatbuffers.Struct; +import com.google.flatbuffers.Table; +import com.google.flatbuffers.UnionVector; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; -@SuppressWarnings("unused") /** * Logically the same as Utf8, but the internal representation uses a view * struct that contains the string length and either the string's entire data @@ -33,8 +44,9 @@ * Since it uses a variable number of data buffers, each Field with this type * must have a corresponding entry in `variadicBufferCounts`. 
*/ +@SuppressWarnings("unused") public final class Utf8View extends Table { - public static void ValidateVersion() { Constants.FLATBUFFERS_1_12_0(); } + public static void ValidateVersion() { Constants.FLATBUFFERS_23_5_26(); } public static Utf8View getRootAsUtf8View(ByteBuffer _bb) { return getRootAsUtf8View(_bb, new Utf8View()); } public static Utf8View getRootAsUtf8View(ByteBuffer _bb, Utf8View obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__assign(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } public void __init(int _i, ByteBuffer _bb) { __reset(_i, _bb); } diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index cfda0b0c527a9..128fa1508fbd1 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -25,7 +25,7 @@ 1.8 1.8 - 3.20.3 + 3.25.1 true ../../../cpp/release-build diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java index b83adf9271d4b..cc615c5b38321 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/MemoryUtil.java @@ -141,8 +141,8 @@ public Object run() { // This exception will get swallowed, but it's necessary for the static analysis that ensures // the static fields above get initialized final RuntimeException failure = new RuntimeException( - "Failed to initialize MemoryUtil. Was Java started with " + - "`--add-opens=java.base/java.nio=ALL-UNNAMED`? " + + "Failed to initialize MemoryUtil. You must start Java with " + + "`--add-opens=java.base/java.nio=ALL-UNNAMED` " + "(See https://arrow.apache.org/docs/java/install.html)", e); failure.printStackTrace(); throw failure; diff --git a/java/memory/memory-netty/pom.xml b/java/memory/memory-netty/pom.xml index 307f6ad81a0f1..e625cbeabc65a 100644 --- a/java/memory/memory-netty/pom.xml +++ b/java/memory/memory-netty/pom.xml @@ -41,7 +41,7 @@ ch.qos.logback logback-core - 1.2.12 + 1.3.14 test diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 102832491ec05..269ac72d83326 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -26,6 +26,7 @@ org.openjdk.jmh jmh-core ${jmh.version} + test org.openjdk.jmh @@ -37,10 +38,12 @@ org.apache.arrow arrow-vector ${arrow.vector.classifier} + test org.apache.arrow arrow-memory-core + test org.apache.arrow @@ -51,10 +54,12 @@ org.apache.avro avro ${dep.avro.version} + test org.apache.arrow arrow-avro + test com.h2database @@ -65,6 +70,7 @@ org.apache.arrow arrow-jdbc + test org.apache.arrow @@ -103,7 +109,6 @@ org.apache.maven.plugins maven-shade-plugin - 2.2 package diff --git a/java/pom.xml b/java/pom.xml index 2a9997b7012b7..cd26e79d47f3d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -30,22 +30,22 @@ ${project.build.directory}/generated-sources 1.9.0 - 5.9.0 - 1.7.25 - 31.1-jre + 5.10.1 + 2.0.7 + 32.1.3-jre 4.1.100.Final - 1.56.0 + 1.59.0 3.23.1 - 2.15.1 + 2.16.0 2.7.1 - 1.12.0 + 23.5.26 1.10.0 2 true 9+181-r4173-1 2.22.0 - 3.10.1 + 3.11.0 5.5.0 5.2.0 @@ -308,7 +308,7 @@ org.slf4j jcl-over-slf4j - 1.7.5 + ${dep.slf4j.version} @@ -361,7 +361,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.6 + 2.7.10 package @@ -378,7 +378,12 @@ org.apache.maven.plugins maven-dependency-plugin - 3.0.1 + + 3.1.2 org.apache.rat @@ -395,6 +400,7 @@ maven-compiler-plugin ${maven-compiler-plugin.version} + false org.immutables @@ -408,6 +414,11 @@ maven-enforcer-plugin 3.0.0-M2 + + org.apache.maven.plugins + 
maven-shade-plugin + 3.5.1 + maven-surefire-plugin 3.0.0-M7 @@ -687,7 +698,7 @@ ch.qos.logback logback-classic - 1.2.3 + 1.3.14 test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 128825c224369..8ea98a84b4ad1 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -37,6 +37,7 @@ com.google.guava guava + test commons-cli @@ -46,7 +47,7 @@ ch.qos.logback logback-classic - 1.2.3 + 1.3.14 runtime diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 4c8bf1e594aa4..17d8f312a52a5 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -174,7 +174,13 @@ org.apache.maven.plugins maven-shade-plugin - 3.1.1 + + 3.2.4 package diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java index de5eb80c0dfb9..d7b147feb152f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java @@ -57,6 +57,16 @@ public NullVector(String name) { this(name, FieldType.nullable(Types.MinorType.NULL.getType())); } + /** + * Instantiate a NullVector with the given number of values. + * + * @param name name of the vector + * @param valueCount number of values (i.e., nulls) in this vector. + */ + public NullVector(String name, int valueCount) { + this(new Field(name, FieldType.nullable(Types.MinorType.NULL.getType()), null), valueCount); + } + /** * Instantiate a NullVector. * @@ -73,8 +83,18 @@ public NullVector(String name, FieldType fieldType) { * @param field field materialized by this vector. */ public NullVector(Field field) { - this.valueCount = 0; + this(field, 0); + } + + /** + * Instantiate a NullVector with the given number of values. + * + * @param field field materialized by this vector. + * @param valueCount number of values (i.e., nulls) in this vector. 
+ */ + public NullVector(Field field, int valueCount) { this.field = field; + this.valueCount = valueCount; } @Deprecated @@ -106,7 +126,7 @@ public Types.MinorType getMinorType() { @Override public TransferPair getTransferPair(BufferAllocator allocator) { - return getTransferPair((String) null, allocator); + return getTransferPair(getName(), allocator); } @Override @@ -159,12 +179,12 @@ public int getValueCapacity() { @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new TransferImpl(); + return new TransferImpl(ref); } @Override public TransferPair getTransferPair(Field field, BufferAllocator allocator) { - return new TransferImpl(); + return new TransferImpl(field.getName()); } @Override diff --git a/js/package.json b/js/package.json index 33bc4849903ef..d72fdd3177016 100644 --- a/js/package.json +++ b/js/package.json @@ -74,7 +74,7 @@ "@types/glob": "8.1.0", "@types/jest": "29.5.3", "@types/randomatic": "3.1.3", - "@typescript-eslint/eslint-plugin": "5.59.9", + "@typescript-eslint/eslint-plugin": "5.62.0", "@typescript-eslint/parser": "5.59.9", "async-done": "2.0.0", "benny": "3.7.1", @@ -85,9 +85,9 @@ "esbuild-plugin-alias": "0.2.1", "eslint": "8.52.0", "eslint-plugin-jest": "27.4.2", - "eslint-plugin-unicorn": "47.0.0", + "eslint-plugin-unicorn": "49.0.0", "esm": "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz", - "glob": "10.2.7", + "glob": "10.3.10", "google-closure-compiler": "20230802.0.0", "gulp": "4.0.2", "gulp-esbuild": "0.11.1", @@ -111,13 +111,13 @@ "ts-jest": "29.1.1", "ts-node": "10.9.1", "typedoc": "0.24.8", - "typescript": "5.1.3", + "typescript": "5.1.6", "vinyl-buffer": "1.0.1", "vinyl-named": "1.1.0", "vinyl-source-stream": "2.0.0", "web-streams-polyfill": "3.2.1", "webpack": "5.86.0", - "webpack-bundle-analyzer": "4.9.1", + "webpack-bundle-analyzer": "4.10.1", "webpack-stream": "7.0.0", "xml2js": "0.6.2" }, diff --git a/js/src/enum.ts b/js/src/enum.ts index 4e207dd37cec1..2a82dd4235c51 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -21,7 +21,7 @@ // v4 doesn't seem to be able to tree-shake the rest of those exports. // // We will have to keep these enums in sync when we re-generate the flatbuffers -// code from the shchemas. See js/DEVELOP.md for info on how to run flatbuffers +// code from the schemas. See js/DEVELOP.md for info on how to run flatbuffers // code generation. // //// @@ -174,7 +174,7 @@ export enum Type { FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */ FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ - Duration = 18, /** Measure of elapsed time in either seconds, miliseconds, microseconds or nanoseconds. */ + Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. 
*/ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -215,7 +215,7 @@ export enum BufferType { OFFSET = 0, /** - * actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector + * actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector */ DATA = 1, diff --git a/js/src/fb/timestamp.ts b/js/src/fb/timestamp.ts index 9c391802e89c9..636a83882a6d9 100644 --- a/js/src/fb/timestamp.ts +++ b/js/src/fb/timestamp.ts @@ -105,7 +105,7 @@ import { TimeUnit } from './time-unit.js'; * no indication of how to map this information to a physical point in time. * Naive date-times must be handled with care because of this missing * information, and also because daylight saving time (DST) may make - * some values ambiguous or non-existent. A naive date-time may be + * some values ambiguous or nonexistent. A naive date-time may be * stored as a struct with Date and Time fields. However, it may also be * encoded into a Timestamp column with an empty timezone. The timestamp * values should be computed "as if" the timezone of the date-time values diff --git a/js/src/ipc/reader.ts b/js/src/ipc/reader.ts index b1ad5248d6158..e4dac0606aa47 100644 --- a/js/src/ipc/reader.ts +++ b/js/src/ipc/reader.ts @@ -185,7 +185,7 @@ export class RecordBatchReader extends ReadableInterop< // // Since TS is a structural type system, we define the following subclass stubs -// so that concrete types exist to associate with with the interfaces below. +// so that concrete types exist to associate with the interfaces below. // // The implementation for each RecordBatchReader is hidden away in the set of // `RecordBatchReaderImpl` classes in the second half of this file. This allows diff --git a/js/src/vector.ts b/js/src/vector.ts index 318ce06e5c3c0..8c9a3da66c92c 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -302,8 +302,8 @@ export class Vector { * values. * * Memoization is very useful when decoding a value is expensive such as - * Uft8. The memoization creates a cache of the size of the Vector and - * therfore increases memory usage. + * Utf8. The memoization creates a cache of the size of the Vector and + * therefore increases memory usage. * * @returns A new vector that memoizes calls to {@link get}. */ diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 2d20f2a8efd5c..54b5610a50eed 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -96,7 +96,7 @@ export class GetBuilderCtor extends Visitor { public visitDurationSecond() { return DurationSecondBuilder; } public visitDurationMillisecond() { return DurationMillisecondBuilder; } public visitDurationMicrosecond() { return DurationMicrosecondBuilder; } - public visistDurationNanosecond() { return DurationNanosecondBuilder; } + public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } } diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 28dcff20d3bd3..4cf0076b3c8e2 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -144,7 +144,7 @@ function indexOfValue(data: Data, searchElement?: T['TVal function indexOfUnion(data: Data, searchElement?: T['TValue'] | null, fromIndex?: number): number { // Unions are special -- they do have a nullBitmap, but so can their children. 
// If the searchElement is null, we don't know whether it came from the Union's - // bitmap or one of its childrens'. So we don't interrogate the Union's bitmap, + // bitmap or one of its children's. So we don't interrogate the Union's bitmap, // since that will report the wrong index if a child has a null before the Union. const get = getVisitor.getVisitFn(data); const compare = createElementComparator(searchElement); diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 55a6b4e2ea390..0af954e4adacc 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -62,9 +62,9 @@ export class JSONVectorAssembler extends Visitor { /** @nocollapse */ public static assemble(...batches: T[]) { - const assemlber = new JSONVectorAssembler(); + const assembler = new JSONVectorAssembler(); return batches.map(({ schema, data }) => { - return assemlber.visitMany(schema.fields, data.children); + return assembler.visitMany(schema.fields, data.children); }); } diff --git a/js/yarn.lock b/js/yarn.lock index eddf380d1ffad..bf22cce197c6b 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -189,12 +189,7 @@ resolved "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz#533f36457a25814cf1df6488523ad547d784a99f" integrity sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw== -"@babel/helper-validator-identifier@^7.19.1", "@babel/helper-validator-identifier@^7.22.19": - version "7.22.19" - resolved "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.19.tgz#2f34ab1e445f5b95e2e6edfe50ea2449e610583a" - integrity sha512-Tinq7ybnEPFFXhlYOYFiSjespWQk0dq2dRNAiMdRTOYQzEGqnnNyrTxPYHP5r6wGjlF1rFgABdDV0g8EwD6Qbg== - -"@babel/helper-validator-identifier@^7.22.20": +"@babel/helper-validator-identifier@^7.22.19", "@babel/helper-validator-identifier@^7.22.20": version "7.22.20" resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz#c4ae002c61d2879e724581d96665583dbc1dc0e0" integrity sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A== @@ -1185,9 +1180,9 @@ "@swc/core-win32-x64-msvc" "1.3.82" "@swc/helpers@^0.5.2": - version "0.5.2" - resolved "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.2.tgz#85ea0c76450b61ad7d10a37050289eded783c27d" - integrity sha512-E4KcWTpoLHqwPHLxidpOqQbcrZVgi0rsmmZXUle1jXmJfuIf/UWpczUJ7MZZ5tlxytgJXyp0w4PGkkeLiuIdZw== + version "0.5.3" + resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.3.tgz#98c6da1e196f5f08f977658b80d6bd941b5f294f" + integrity sha512-FaruWX6KdudYloq1AHD/4nU+UsMTdNE8CKyrseXWEcgjDAbvkwJg2QGPAnfIJLIWsjZOSPLOAykK6fuYp4vp4A== dependencies: tslib "^2.4.0" @@ -1365,9 +1360,9 @@ integrity sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw== "@types/pad-left@^2.1.1": - version "2.1.1" - resolved "https://registry.npmjs.org/@types/pad-left/-/pad-left-2.1.1.tgz#17d906fc75804e1cc722da73623f1d978f16a137" - integrity sha512-Xd22WCRBydkGSApl5Bw0PhAOHKSVjNL3E3AwzKaps96IMraPqy5BvZIsBVK6JLwdybUzjHnuWVwpDd0JjTfHXA== + version "2.1.3" + resolved "https://registry.yarnpkg.com/@types/pad-left/-/pad-left-2.1.3.tgz#f636e62154e95bf6660439c51fe828da918124b2" + integrity sha512-fayws3T8lGvIY3UEtqFHKSH6FS1Lepo6kd3ZTgdj8rsVIIwzr9MZJt1ZP9UGu+cdAZsJiG2d5iYxyhRXwtUB5A== "@types/randomatic@3.1.3": version "3.1.3" @@ -1416,17 +1411,17 @@ dependencies: 
"@types/yargs-parser" "*" -"@typescript-eslint/eslint-plugin@5.59.9": - version "5.59.9" - resolved "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.59.9.tgz#2604cfaf2b306e120044f901e20c8ed926debf15" - integrity sha512-4uQIBq1ffXd2YvF7MAvehWKW3zVv/w+mSfRAu+8cKbfj3nwzyqJLNcZJpQ/WZ1HLbJDiowwmQ6NO+63nCA+fqA== +"@typescript-eslint/eslint-plugin@5.62.0": + version "5.62.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.62.0.tgz#aeef0328d172b9e37d9bab6dbc13b87ed88977db" + integrity sha512-TiZzBSJja/LbhNPvk6yc0JrX9XqhQ0hdh6M2svYfsHGejaKFIAGd9MQ+ERIMzLGlN/kZoYIgdxFV0PuljTKXag== dependencies: "@eslint-community/regexpp" "^4.4.0" - "@typescript-eslint/scope-manager" "5.59.9" - "@typescript-eslint/type-utils" "5.59.9" - "@typescript-eslint/utils" "5.59.9" + "@typescript-eslint/scope-manager" "5.62.0" + "@typescript-eslint/type-utils" "5.62.0" + "@typescript-eslint/utils" "5.62.0" debug "^4.3.4" - grapheme-splitter "^1.0.4" + graphemer "^1.4.0" ignore "^5.2.0" natural-compare-lite "^1.4.0" semver "^7.3.7" @@ -1458,13 +1453,13 @@ "@typescript-eslint/types" "5.62.0" "@typescript-eslint/visitor-keys" "5.62.0" -"@typescript-eslint/type-utils@5.59.9": - version "5.59.9" - resolved "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-5.59.9.tgz#53bfaae2e901e6ac637ab0536d1754dfef4dafc2" - integrity sha512-ksEsT0/mEHg9e3qZu98AlSrONAQtrSTljL3ow9CGej8eRo7pe+yaC/mvTjptp23Xo/xIf2mLZKC6KPv4Sji26Q== +"@typescript-eslint/type-utils@5.62.0": + version "5.62.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-5.62.0.tgz#286f0389c41681376cdad96b309cedd17d70346a" + integrity sha512-xsSQreu+VnfbqQpW5vnCJdq1Z3Q0U31qiWmRhr98ONQmcp/yhiPJFPq8MXiJVLiksmOKSjIldZzkebzHuCGzew== dependencies: - "@typescript-eslint/typescript-estree" "5.59.9" - "@typescript-eslint/utils" "5.59.9" + "@typescript-eslint/typescript-estree" "5.62.0" + "@typescript-eslint/utils" "5.62.0" debug "^4.3.4" tsutils "^3.21.0" @@ -1504,21 +1499,7 @@ semver "^7.3.7" tsutils "^3.21.0" -"@typescript-eslint/utils@5.59.9": - version "5.59.9" - resolved "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-5.59.9.tgz#adee890107b5ffe02cd46fdaa6c2125fb3c6c7c4" - integrity sha512-1PuMYsju/38I5Ggblaeb98TOoUvjhRvLpLa1DoTOFaLWqaXl/1iQ1eGurTXgBY58NUdtfTXKP5xBq7q9NDaLKg== - dependencies: - "@eslint-community/eslint-utils" "^4.2.0" - "@types/json-schema" "^7.0.9" - "@types/semver" "^7.3.12" - "@typescript-eslint/scope-manager" "5.59.9" - "@typescript-eslint/types" "5.59.9" - "@typescript-eslint/typescript-estree" "5.59.9" - eslint-scope "^5.1.1" - semver "^7.3.7" - -"@typescript-eslint/utils@^5.10.0": +"@typescript-eslint/utils@5.62.0", "@typescript-eslint/utils@^5.10.0": version "5.62.0" resolved "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-5.62.0.tgz#141e809c71636e4a75daa39faed2fb5f4b10df86" integrity sha512-n8oxjeb5aIbPFEtmQxQYOLI0i9n5ySBEY/ZEHHZqKQSFnxio1rv6dthascc9dLuwrL0RC5mPCxB7vnAVGAYWAQ== @@ -2613,6 +2594,11 @@ d@1, d@^1.0.1: es5-ext "^0.10.50" type "^1.0.1" +debounce@^1.2.1: + version "1.2.1" + resolved "https://registry.yarnpkg.com/debounce/-/debounce-1.2.1.tgz#38881d8f4166a5c5848020c11827b834bcb3e0a5" + integrity sha512-XRRe6Glud4rd/ZGQfiV1ruXSfbvfJedlV9Y6zOlP+2K04vBYiJEte6stfFkCP03aMnY5tsipamumUjL14fofug== + debug-fabulous@^1.0.0: version "1.1.0" resolved "https://registry.npmjs.org/debug-fabulous/-/debug-fabulous-1.1.0.tgz#af8a08632465224ef4174a9f06308c3c2a1ebc8e" @@ -3005,12 +2991,12 @@ 
eslint-plugin-jest@27.4.2: dependencies: "@typescript-eslint/utils" "^5.10.0" -eslint-plugin-unicorn@47.0.0: - version "47.0.0" - resolved "https://registry.npmjs.org/eslint-plugin-unicorn/-/eslint-plugin-unicorn-47.0.0.tgz#960e9d3789f656ba3e21982420793b069a911011" - integrity sha512-ivB3bKk7fDIeWOUmmMm9o3Ax9zbMz1Bsza/R2qm46ufw4T6VBFBaJIR1uN3pCKSmSXm8/9Nri8V+iUut1NhQGA== +eslint-plugin-unicorn@49.0.0: + version "49.0.0" + resolved "https://registry.yarnpkg.com/eslint-plugin-unicorn/-/eslint-plugin-unicorn-49.0.0.tgz#4449ea954d7e1455eec8518f9417d7021b245fa8" + integrity sha512-0fHEa/8Pih5cmzFW5L7xMEfUTvI9WKeQtjmKpTUmY+BiFCDxkxrTdnURJOHKykhtwIeyYsxnecbGvDCml++z4Q== dependencies: - "@babel/helper-validator-identifier" "^7.19.1" + "@babel/helper-validator-identifier" "^7.22.20" "@eslint-community/eslint-utils" "^4.4.0" ci-info "^3.8.0" clean-regexp "^1.0.0" @@ -3018,13 +3004,11 @@ eslint-plugin-unicorn@47.0.0: indent-string "^4.0.0" is-builtin-module "^3.2.1" jsesc "^3.0.2" - lodash "^4.17.21" pluralize "^8.0.0" read-pkg-up "^7.0.1" - regexp-tree "^0.1.24" + regexp-tree "^0.1.27" regjsparser "^0.10.0" - safe-regex "^2.1.1" - semver "^7.3.8" + semver "^7.5.4" strip-indent "^3.0.0" eslint-scope@5.1.1, eslint-scope@^5.1.1: @@ -3607,16 +3591,16 @@ glob-watcher@^5.0.3: normalize-path "^3.0.0" object.defaults "^1.1.0" -glob@10.2.7: - version "10.2.7" - resolved "https://registry.npmjs.org/glob/-/glob-10.2.7.tgz#9dd2828cd5bc7bd861e7738d91e7113dda41d7d8" - integrity sha512-jTKehsravOJo8IJxUGfZILnkvVJM/MOfHRs8QcXolVef2zNI9Tqyy5+SeuOAZd3upViEZQLyFpQhYiHLrMUNmA== +glob@10.3.10: + version "10.3.10" + resolved "https://registry.yarnpkg.com/glob/-/glob-10.3.10.tgz#0351ebb809fd187fe421ab96af83d3a70715df4b" + integrity sha512-fa46+tv1Ak0UPK1TOy/pZrIybNNt4HCv7SDzwyfiOZkvZLEbjsZkJBPtDHVshZjbecAoAGSC20MjLDG/qr679g== dependencies: foreground-child "^3.1.0" - jackspeak "^2.0.3" + jackspeak "^2.3.5" minimatch "^9.0.1" - minipass "^5.0.0 || ^6.0.2" - path-scurry "^1.7.0" + minipass "^5.0.0 || ^6.0.2 || ^7.0.0" + path-scurry "^1.10.1" glob@^7.1.1, glob@^7.1.3, glob@^7.1.4: version "7.2.3" @@ -3739,11 +3723,6 @@ graceful-fs@^4.0.0, graceful-fs@^4.1.11, graceful-fs@^4.1.2, graceful-fs@^4.1.6, resolved "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz#4183e4e8bf08bb6e05bbb2f7d2e0c8f712ca40e3" integrity sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ== -grapheme-splitter@^1.0.4: - version "1.0.4" - resolved "https://registry.npmjs.org/grapheme-splitter/-/grapheme-splitter-1.0.4.tgz#9cf3a665c6247479896834af35cf1dbb4400767e" - integrity sha512-bzh50DW9kTPM00T8y4o8vQg89Di9oLJVLW/KaOGIXJWP/iqCN6WKYkbNOF04vFLJhwcpYUh9ydh/+5vpOqV4YQ== - graphemer@^1.4.0: version "1.4.0" resolved "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz#fb2f1d55e0e3a1849aeffc90c4fa0dd53a0e66c6" @@ -3971,7 +3950,7 @@ hosted-git-info@^4.0.1: dependencies: lru-cache "^6.0.0" -html-escaper@^2.0.0: +html-escaper@^2.0.0, html-escaper@^2.0.2: version "2.0.2" resolved "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== @@ -4379,10 +4358,10 @@ ix@5.0.0: "@types/node" "^13.7.4" tslib "^2.3.0" -jackspeak@^2.0.3: - version "2.3.3" - resolved "https://registry.npmjs.org/jackspeak/-/jackspeak-2.3.3.tgz#95e4cbcc03b3eb357bf6bcce14a903fb3d1151e1" - integrity 
sha512-R2bUw+kVZFS/h1AZqBKrSgDmdmjApzgY0AlCPumopFiAlbUxE2gf+SCuBzQ0cP5hHmUmFYF5yw55T97Th5Kstg== +jackspeak@^2.3.5: + version "2.3.6" + resolved "https://registry.yarnpkg.com/jackspeak/-/jackspeak-2.3.6.tgz#647ecc472238aee4b06ac0e461acc21a8c505ca8" + integrity sha512-N3yCS/NegsOBokc8GAdM8UcmfsKiSS8cipheD/nivzr700H+nsMOxJjQnvwOcRYVuFkdH0wGUvW2WbXGmrZGbQ== dependencies: "@isaacs/cliui" "^8.0.2" optionalDependencies: @@ -5028,31 +5007,11 @@ lodash.clone@^4.3.2: resolved "https://registry.npmjs.org/lodash.clone/-/lodash.clone-4.5.0.tgz#195870450f5a13192478df4bc3d23d2dea1907b6" integrity sha512-GhrVeweiTD6uTmmn5hV/lzgCQhccwReIVRLHp7LT4SopOjqEZ5BbX8b5WWEtAKasjmy8hR7ZPwsYlxRCku5odg== -lodash.debounce@^4.0.8: - version "4.0.8" - resolved "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz#82d79bff30a67c4005ffd5e2515300ad9ca4d7af" - integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow== - -lodash.escape@^4.0.1: - version "4.0.1" - resolved "https://registry.npmjs.org/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98" - integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw== - -lodash.flatten@^4.4.0: - version "4.4.0" - resolved "https://registry.npmjs.org/lodash.flatten/-/lodash.flatten-4.4.0.tgz#f31c22225a9632d2bbf8e4addbef240aa765a61f" - integrity sha512-C5N2Z3DgnnKr0LOpv/hKCgKdb7ZZwafIrsesve6lmzvZIRZRGaZ/l6Q8+2W7NaT+ZwO3fFlSCzCzrDCFdJfZ4g== - lodash.get@^4.4.2: version "4.4.2" resolved "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz#2d177f652fa31e939b4438d5341499dfa3825e99" integrity sha512-z+Uw/vLuy6gQe8cfaFWD7p0wVv8fJl3mbzXh33RS+0oW2wvUqiRXiQ69gLWSLpgB5/6sU+r6BlQR0MBILadqTQ== -lodash.invokemap@^4.6.0: - version "4.6.0" - resolved "https://registry.npmjs.org/lodash.invokemap/-/lodash.invokemap-4.6.0.tgz#1748cda5d8b0ef8369c4eb3ec54c21feba1f2d62" - integrity sha512-CfkycNtMqgUlfjfdh2BhKO/ZXrP8ePOX5lEU/g0R3ItJcnuxWDwokMGKx1hWcfOikmyOVx6X9IwWnDGlgKl61w== - lodash.memoize@4.x: version "4.1.2" resolved "https://registry.npmjs.org/lodash.memoize/-/lodash.memoize-4.1.2.tgz#bcc6c49a42a2840ed997f323eada5ecd182e0bfe" @@ -5063,22 +5022,12 @@ lodash.merge@^4.6.2: resolved "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz#558aa53b43b661e1925a0afdfa36a9a1085fe57a" integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ== -lodash.pullall@^4.2.0: - version "4.2.0" - resolved "https://registry.npmjs.org/lodash.pullall/-/lodash.pullall-4.2.0.tgz#9d98b8518b7c965b0fae4099bd9fb7df8bbf38ba" - integrity sha512-VhqxBKH0ZxPpLhiu68YD1KnHmbhQJQctcipvmFnqIBDYzcIHzf3Zpu0tpeOKtR4x76p9yohc506eGdOjTmyIBg== - lodash.some@^4.2.2: version "4.6.0" resolved "https://registry.npmjs.org/lodash.some/-/lodash.some-4.6.0.tgz#1bb9f314ef6b8baded13b549169b2a945eb68e4d" integrity sha512-j7MJE+TuT51q9ggt4fSgVqro163BEFjAt3u97IqU+JA2DkWl80nFTrowzLpZ/BnpN7rrl0JA/593NAdd8p/scQ== -lodash.uniqby@^4.7.0: - version "4.7.0" - resolved "https://registry.npmjs.org/lodash.uniqby/-/lodash.uniqby-4.7.0.tgz#d99c07a669e9e6d24e1362dfe266c67616af1302" - integrity sha512-e/zcLx6CSbmaEgFHCA7BnoQKyCtKMxnuWrJygbwPs/AIn+IMKl66L8/s+wBUn5LRw2pZx3bUHibiV1b6aTWIww== - -lodash@^4.17.21, lodash@^4.17.4: +lodash@^4.17.4: version "4.17.21" resolved "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c" integrity 
sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg== @@ -5327,11 +5276,6 @@ minimist@1.x: resolved "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz#c1a464e7693302e082a075cee0c057741ac4772c" integrity sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA== -"minipass@^5.0.0 || ^6.0.2": - version "6.0.2" - resolved "https://registry.npmjs.org/minipass/-/minipass-6.0.2.tgz#542844b6c4ce95b202c0995b0a471f1229de4c81" - integrity sha512-MzWSV5nYVT7mVyWCwn2o7JH13w2TBRmmSqSRCKzTw+lmft9X4z+3wjvs06Tzijo5z4W/kahUCDpRXTF+ZrmF/w== - "minipass@^5.0.0 || ^6.0.2 || ^7.0.0": version "7.0.3" resolved "https://registry.npmjs.org/minipass/-/minipass-7.0.3.tgz#05ea638da44e475037ed94d1c7efcc76a25e1974" @@ -5739,9 +5683,9 @@ path-root@^0.1.1: dependencies: path-root-regex "^0.1.0" -path-scurry@^1.7.0: +path-scurry@^1.10.1: version "1.10.1" - resolved "https://registry.npmjs.org/path-scurry/-/path-scurry-1.10.1.tgz#9ba6bf5aa8500fe9fd67df4f0d9483b2b0bfc698" + resolved "https://registry.yarnpkg.com/path-scurry/-/path-scurry-1.10.1.tgz#9ba6bf5aa8500fe9fd67df4f0d9483b2b0bfc698" integrity sha512-MkhCqzzBEpPvxxQ71Md0b1Kk51W01lrYvlMzSUaIzNsODdd7mqhiimSZlr+VegAz5Z6Vzt9Xg2ttE//XBhH3EQ== dependencies: lru-cache "^9.1.1 || ^10.0.0" @@ -6066,9 +6010,9 @@ regex-not@^1.0.0, regex-not@^1.0.2: extend-shallow "^3.0.2" safe-regex "^1.1.0" -regexp-tree@^0.1.24, regexp-tree@~0.1.1: +regexp-tree@^0.1.27: version "0.1.27" - resolved "https://registry.npmjs.org/regexp-tree/-/regexp-tree-0.1.27.tgz#2198f0ef54518ffa743fe74d983b56ffd631b6cd" + resolved "https://registry.yarnpkg.com/regexp-tree/-/regexp-tree-0.1.27.tgz#2198f0ef54518ffa743fe74d983b56ffd631b6cd" integrity sha512-iETxpjK6YoRWJG5o6hXLwvjYAoW+FEZn9os0PD/b6AP6xQwsa/Y7lCVgIixBbUPMfhu+i2LtdeAqVTgGlQarfA== regjsparser@^0.10.0: @@ -6274,13 +6218,6 @@ safe-regex@^1.1.0: dependencies: ret "~0.1.10" -safe-regex@^2.1.1: - version "2.1.1" - resolved "https://registry.npmjs.org/safe-regex/-/safe-regex-2.1.1.tgz#f7128f00d056e2fe5c11e81a1324dd974aadced2" - integrity sha512-rx+x8AMzKb5Q5lQ95Zoi6ZbJqwCLkqi3XuJXp5P3rT8OEc6sZCJG5AE5dU3lsgRr/F4Bs31jSlVN+j5KrsGu9A== - dependencies: - regexp-tree "~0.1.1" - sax@>=0.6.0: version "1.2.4" resolved "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" @@ -6312,7 +6249,7 @@ semver@^6.3.0, semver@^6.3.1: resolved "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4" integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA== -semver@^7.3.4, semver@^7.3.7, semver@^7.3.8, semver@^7.5.3, semver@^7.5.4: +semver@^7.3.4, semver@^7.3.7, semver@^7.5.3, semver@^7.5.4: version "7.5.4" resolved "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz#483986ec4ed38e1c6c48c34894a9182dbff68a6e" integrity sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA== @@ -7020,10 +6957,10 @@ typedoc@0.24.8: minimatch "^9.0.0" shiki "^0.14.1" -typescript@5.1.3: - version "5.1.3" - resolved "https://registry.npmjs.org/typescript/-/typescript-5.1.3.tgz#8d84219244a6b40b6fb2b33cc1c062f715b9e826" - integrity sha512-XH627E9vkeqhlZFQuL+UsyAXEnibT0kWR2FWONlr4sTjvxyJYnyefgrkyECLzM5NenmKzRAy2rR/OlYLA1HkZw== +typescript@5.1.6: + version "5.1.6" + resolved "https://registry.yarnpkg.com/typescript/-/typescript-5.1.6.tgz#02f8ac202b6dad2c0dd5e0913745b47a37998274" + integrity 
sha512-zaWCozRZ6DLEWAWFrVDz1H6FVXzUSfTy5FUMWsQlU8Ym5JP9eO4xkTIROFCQvhQf61z6O/G6ugw3SgAnvvm+HA== typical@^4.0.0: version "4.0.0" @@ -7285,24 +7222,20 @@ web-streams-polyfill@3.2.1: resolved "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz#71c2718c52b45fd49dbeee88634b3a60ceab42a6" integrity sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q== -webpack-bundle-analyzer@4.9.1: - version "4.9.1" - resolved "https://registry.npmjs.org/webpack-bundle-analyzer/-/webpack-bundle-analyzer-4.9.1.tgz#d00bbf3f17500c10985084f22f1a2bf45cb2f09d" - integrity sha512-jnd6EoYrf9yMxCyYDPj8eutJvtjQNp8PHmni/e/ulydHBWhT5J3menXt3HEkScsu9YqMAcG4CfFjs3rj5pVU1w== +webpack-bundle-analyzer@4.10.1: + version "4.10.1" + resolved "https://registry.yarnpkg.com/webpack-bundle-analyzer/-/webpack-bundle-analyzer-4.10.1.tgz#84b7473b630a7b8c21c741f81d8fe4593208b454" + integrity sha512-s3P7pgexgT/HTUSYgxJyn28A+99mmLq4HsJepMPzu0R8ImJc52QNqaFYW1Z2z2uIb1/J3eYgaAWVpaC+v/1aAQ== dependencies: "@discoveryjs/json-ext" "0.5.7" acorn "^8.0.4" acorn-walk "^8.0.0" commander "^7.2.0" + debounce "^1.2.1" escape-string-regexp "^4.0.0" gzip-size "^6.0.0" + html-escaper "^2.0.2" is-plain-object "^5.0.0" - lodash.debounce "^4.0.8" - lodash.escape "^4.0.1" - lodash.flatten "^4.4.0" - lodash.invokemap "^4.6.0" - lodash.pullall "^4.2.0" - lodash.uniqby "^4.7.0" opener "^1.5.2" picocolors "^1.0.0" sirv "^2.0.3" diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc index bc5ab093b4534..cdeb4e03fa666 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc +++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc @@ -60,7 +60,7 @@ namespace arrow::matlab::array::proxy { /* * Display primitive and string types horizontally without * opening and closing delimiters. Use " | " as the delimiter - * between elments. Below is an example Int32Array display: + * between elements. Below is an example Int32Array display: * * 1 | 2 | 3 | ... 
| 6 | 7 | 8 */ diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/time_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/time_array.h index 68713a3dcce34..696f021921e23 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/time_array.h +++ b/matlab/src/cpp/arrow/matlab/array/proxy/time_array.h @@ -48,7 +48,7 @@ namespace arrow::matlab::array::proxy { const std::u16string& time_unit_utf16 = time_unit_mda[0]; MATLAB_ASSIGN_OR_ERROR(const auto time_unit, timeUnitFromString(time_unit_utf16), - error::UKNOWN_TIME_UNIT_ERROR_ID); + error::UNKNOWN_TIME_UNIT_ERROR_ID); MATLAB_ERROR_IF_NOT_OK(validateTimeUnit(time_unit), error::INVALID_TIME_UNIT); diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc index 45a5b6ab92239..24abb4161a865 100644 --- a/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc +++ b/matlab/src/cpp/arrow/matlab/array/proxy/timestamp_array.cc @@ -49,7 +49,7 @@ namespace arrow::matlab::array::proxy { const std::u16string& u16_timeunit = units_mda[0]; MATLAB_ASSIGN_OR_ERROR(const auto time_unit, arrow::matlab::type::timeUnitFromString(u16_timeunit), - error::UKNOWN_TIME_UNIT_ERROR_ID) + error::UNKNOWN_TIME_UNIT_ERROR_ID) // create the timestamp_type auto data_type = arrow::timestamp(time_unit, timezone); diff --git a/matlab/src/cpp/arrow/matlab/bit/pack.cc b/matlab/src/cpp/arrow/matlab/bit/pack.cc index e80ce723658e5..d47f2955f1081 100644 --- a/matlab/src/cpp/arrow/matlab/bit/pack.cc +++ b/matlab/src/cpp/arrow/matlab/bit/pack.cc @@ -41,7 +41,7 @@ namespace arrow::matlab::bit { } } - // Pack an unpacked MATLAB logical array into into a bit-packed arrow::Buffer. + // Pack an unpacked MATLAB logical array into a bit-packed arrow::Buffer. arrow::Result> pack(const ::matlab::data::TypedArray matlab_logical_array) { // Validate that the input arrow::Buffer has sufficient size to store a full bit-packed // representation of the input MATLAB logical array. 
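For reference, the pack() helper described above produces the same bit-packed layout that Arrow uses for boolean data and validity bitmaps. Below is a minimal pyarrow sketch of that layout; the logical values are invented purely for illustration and are not part of this change.

import pyarrow as pa

# Invented input, standing in for a MATLAB logical array.
logical_values = [True, False, None, True, True]

arr = pa.array(logical_values, type=pa.bool_())

# A BooleanArray carries two bit-packed buffers: a validity bitmap and the
# data itself, one bit per element rather than one byte per element.
validity_buffer, data_buffer = arr.buffers()
print("validity bytes:", validity_buffer.size)
print("data bytes:", data_buffer.size)
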
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index e6be411b62a05..c9eaf1d9640d4 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -170,7 +170,7 @@ namespace arrow::matlab::error { static const char* UNICODE_CONVERSION_ERROR_ID = "arrow:matlab:unicode:UnicodeConversion"; static const char* STRING_BUILDER_APPEND_FAILED = "arrow:matlab:array:string:StringBuilderAppendFailed"; static const char* STRING_BUILDER_FINISH_FAILED = "arrow:matlab:array:string:StringBuilderFinishFailed"; - static const char* UKNOWN_TIME_UNIT_ERROR_ID = "arrow:matlab:UnknownTimeUnit"; + static const char* UNKNOWN_TIME_UNIT_ERROR_ID = "arrow:matlab:UnknownTimeUnit"; static const char* INVALID_TIME_UNIT = "arrow:type:InvalidTimeUnit"; static const char* FIELD_FAILED_TO_CREATE_TYPE_PROXY = "arrow:field:FailedToCreateTypeProxy"; static const char* ARRAY_FAILED_TO_CREATE_TYPE_PROXY = "arrow:array:FailedToCreateTypeProxy"; diff --git a/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h index 824b6c19a7109..aaa0d57b119a3 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h +++ b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h @@ -56,11 +56,11 @@ namespace arrow::matlab::tabular { ARROW_RETURN_NOT_OK(arrow::PrettyPrint(*slice, opts, &ss)); } else if (type_id == arrow::Type::type::STRUCT) { // Use as a placeholder since we don't have a good - // way to display StructArray elements horiztonally on screen. + // way to display StructArray elements horizontally on screen. ss << ""; } else if (type_id == arrow::Type::type::LIST) { // Use as a placeholder since we don't have a good - // way to display ListArray elements horiztonally on screen. + // way to display ListArray elements horizontally on screen. ss << ""; } else { return arrow::Status::NotImplemented("Datatype " + column->type()->ToString() + "is not currently supported for display."); diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc index 023381e005969..db7486bdb1bb7 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/schema.cc @@ -129,7 +129,7 @@ namespace arrow::matlab::tabular::proxy { std::vector field_names_utf16; field_names_utf16.reserve(num_fields); - // Conver the field names from UTF-8 to UTF-16. + // Convert the field names from UTF-8 to UTF-16. 
for (const auto& field_name_utf8 : field_names_utf8) { MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto field_name_utf16, arrow::util::UTF8StringToUTF16(field_name_utf8), context, error::UNICODE_CONVERSION_ERROR_ID); field_names_utf16.push_back(field_name_utf16); diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/time_type.cc b/matlab/src/cpp/arrow/matlab/type/proxy/time_type.cc index 84c875ba489ad..16d5cc3193e9b 100644 --- a/matlab/src/cpp/arrow/matlab/type/proxy/time_type.cc +++ b/matlab/src/cpp/arrow/matlab/type/proxy/time_type.cc @@ -53,7 +53,7 @@ namespace arrow::matlab::type::proxy { const std::u16string& time_unit_utf16 = time_unit_mda[0]; MATLAB_ASSIGN_OR_ERROR(const auto timeunit, timeUnitFromString(time_unit_utf16), - error::UKNOWN_TIME_UNIT_ERROR_ID); + error::UNKNOWN_TIME_UNIT_ERROR_ID); // validate timeunit MATLAB_ERROR_IF_NOT_OK(validateTimeUnit(timeunit), diff --git a/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc b/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc index 9ae5fcde67937..7397e96826e36 100644 --- a/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc +++ b/matlab/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc @@ -48,7 +48,7 @@ namespace arrow::matlab::type::proxy { const std::u16string& utf16_timeunit = timeunit_mda[0]; MATLAB_ASSIGN_OR_ERROR(const auto timeunit, arrow::matlab::type::timeUnitFromString(utf16_timeunit), - error::UKNOWN_TIME_UNIT_ERROR_ID); + error::UNKNOWN_TIME_UNIT_ERROR_ID); auto type = arrow::timestamp(timeunit, timezone); auto time_type = std::static_pointer_cast(type); diff --git a/matlab/src/matlab/+arrow/+array/StructArray.m b/matlab/src/matlab/+arrow/+array/StructArray.m index a92243faaadd1..70b2c16a96e3c 100644 --- a/matlab/src/matlab/+arrow/+array/StructArray.m +++ b/matlab/src/matlab/+arrow/+array/StructArray.m @@ -100,7 +100,7 @@ end function nullSubVal = get.NullSubstitutionValue(obj) - % Return a cell array containing each field's type-specifc + % Return a cell array containing each field's type-specific % "null" value. For example, NaN is the type-specific null % value for Float32Arrays and Float64Arrays numFields = obj.NumFields; diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m index df6a11612043c..79065ba1c8cfd 100644 --- a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m @@ -20,7 +20,7 @@ opts.FullClassName(1, 1) string opts.ClassName(1, 1) string % When displaying heterogeneous arrays, only the name of the - % closest shared anscestor class is displayed in bold. All other + % closest shared ancestor class is displayed in bold. All other % class names are not bolded. opts.BoldFont(1, 1) logical end diff --git a/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m b/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m index 8f7557c18b658..ec5796edf086c 100644 --- a/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m +++ b/matlab/src/matlab/+arrow/+internal/+validate/nonsparse.m @@ -1,5 +1,5 @@ %NONSPARESE Verifies data is nonsparse. Otherwise throws an error with the -% identifier "arrrow:array:Sparse". +% identifier "arrow:array:Sparse". % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. 
See the NOTICE file distributed with diff --git a/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m b/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m index 1f57cee4d8622..3b8d0c5402513 100644 --- a/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m +++ b/matlab/src/matlab/+arrow/+internal/+validate/realnumeric.m @@ -1,5 +1,5 @@ %REALNUMERIC Verifies the numeric array data is real. Otherwise throws an -% error with the identifier "arrrow:array:ComplexNumeric". +% error with the identifier "arrow:array:ComplexNumeric". % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with diff --git a/matlab/src/matlab/+arrow/+internal/+validate/shape.m b/matlab/src/matlab/+arrow/+internal/+validate/shape.m index d9a8e29076893..f75363d4c0004 100644 --- a/matlab/src/matlab/+arrow/+internal/+validate/shape.m +++ b/matlab/src/matlab/+arrow/+internal/+validate/shape.m @@ -1,5 +1,5 @@ %SHAPE Verifies data is either a vector or empty. Otherwise throws an error -% with the identifier "arrrow:array:InvalidShape". +% with the identifier "arrow:array:InvalidShape". % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with diff --git a/matlab/src/matlab/+arrow/+internal/+validate/type.m b/matlab/src/matlab/+arrow/+internal/+validate/type.m index 7977d812adbf6..50306d722c4a6 100644 --- a/matlab/src/matlab/+arrow/+internal/+validate/type.m +++ b/matlab/src/matlab/+arrow/+internal/+validate/type.m @@ -1,5 +1,5 @@ %TYPE Verifies data has the expected class type. Otherwise throws an -% error with the identifier "arrrow:array:InvalidType". +% error with the identifier "arrow:array:InvalidType". % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m b/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m index 88f7b10806212..b8a2bcfdedacd 100644 --- a/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m +++ b/matlab/src/matlab/+arrow/+tabular/+internal/makeValidDimensionNames.m @@ -20,9 +20,9 @@ dimnames = ["Row" "Variables"]; numvars = numel(varnames); - indicesToUniqify = [numvars + 1 numvars + 2]; + indicesToUniquify = [numvars + 1 numvars + 2]; - strs = matlab.lang.makeUniqueStrings([varnames dimnames], indicesToUniqify); - dimnames = strs(indicesToUniqify); + strs = matlab.lang.makeUniqueStrings([varnames dimnames], indicesToUniquify); + dimnames = strs(indicesToUniquify); end diff --git a/matlab/src/matlab/+arrow/+type/Type.m b/matlab/src/matlab/+arrow/+type/Type.m index 6dc4fbc438f34..2a2ba145a644d 100644 --- a/matlab/src/matlab/+arrow/+type/Type.m +++ b/matlab/src/matlab/+arrow/+type/Type.m @@ -82,7 +82,7 @@ else % Check if every type in the array has the same class type. % If so, call getDisplayPropertyGroups() so that all - % properties assoicated with that class are displayed. + % properties associated with that class are displayed. 
classnames = arrayfun(@(type) string(class(type)), obj); if numel(unique(classnames)) == 1 groups = getDisplayPropertyGroups(obj(1)); diff --git a/matlab/src/matlab/+arrow/+util/table2mlarrow.m b/matlab/src/matlab/+arrow/+util/table2mlarrow.m index 391b0603ea2c4..a629cc4fb8852 100644 --- a/matlab/src/matlab/+arrow/+util/table2mlarrow.m +++ b/matlab/src/matlab/+arrow/+util/table2mlarrow.m @@ -6,7 +6,7 @@ % Takes a MATLAB table T and returns struct array equivalents % which are suitable for passing to the mlarrow C++ MEX layer. % -% VARIABLES is an 1xN struct array representing the the table variables. +% VARIABLES is an 1xN struct array representing the table variables. % % VARIABLES contains the following fields: % diff --git a/matlab/test/arrow/array/list/tTableValidator.m b/matlab/test/arrow/array/list/tTableValidator.m index b3aeac9b6728c..fcebaa9046d0a 100644 --- a/matlab/test/arrow/array/list/tTableValidator.m +++ b/matlab/test/arrow/array/list/tTableValidator.m @@ -255,7 +255,7 @@ function validateElementNestedTableVariable(testCase) %#ok end function GetElementLength(testCase) - % Verify GetElementLength returns the the number of rows as the + % Verify GetElementLength returns the number of rows as the % length of the element. import arrow.array.internal.list.TableValidator diff --git a/matlab/test/arrow/array/tChunkedArray.m b/matlab/test/arrow/array/tChunkedArray.m index cf11412ebb36b..c37206241b413 100644 --- a/matlab/test/arrow/array/tChunkedArray.m +++ b/matlab/test/arrow/array/tChunkedArray.m @@ -112,7 +112,7 @@ function TestIsEqualTrue(testCase) % Verifies ChunkedArrays are considered equal if: % % 1. Their Type properties are equal - % 2. Their NumElements properties ar eequal + % 2. Their NumElements properties are equal % 3. The same elements are considered null % 4. All corresponding valid elements have the same values % diff --git a/matlab/test/arrow/array/tFloat32Array.m b/matlab/test/arrow/array/tFloat32Array.m index 8a583c6b64a15..8c9c0f1f362de 100644 --- a/matlab/test/arrow/array/tFloat32Array.m +++ b/matlab/test/arrow/array/tFloat32Array.m @@ -102,7 +102,7 @@ function LogicalValidNVPair(testCase) testCase.verifyEqual(toMATLAB(arrowArray), single([NaN; 2; 3])); end - function NumericlValidNVPair(testCase) + function NumericValidNVPair(testCase) matlabArray = single([1 2 3]); % Supply a numeric vector for Valid diff --git a/matlab/test/arrow/array/tFloat64Array.m b/matlab/test/arrow/array/tFloat64Array.m index ec2871a124894..84dfe57e6ca87 100755 --- a/matlab/test/arrow/array/tFloat64Array.m +++ b/matlab/test/arrow/array/tFloat64Array.m @@ -107,7 +107,7 @@ function LogicalValidNVPair(testCase) testCase.verifyEqual(toMATLAB(arrowArray), [NaN; 2; 3]); end - function NumericlValidNVPair(testCase) + function NumericValidNVPair(testCase) matlabArray = [1 2 3]; % Supply a numeric vector for Valid diff --git a/matlab/test/arrow/array/tListArray.m b/matlab/test/arrow/array/tListArray.m index 07304eb384299..0b5a74313fc06 100644 --- a/matlab/test/arrow/array/tListArray.m +++ b/matlab/test/arrow/array/tListArray.m @@ -201,7 +201,7 @@ function TestValidationModeDefault(testCase, TestValidationModeArray) function TestValidationModeNone(testCase, TestValidationModeArray) % Verify that no error is thrown when supplying the - % ValidatationMode name-value pair, with a value of + % ValidationMode name-value pair, with a value of % arrow.array.ValidationMode.None, to the % arrow.array.ListArray.fromArrays method. 
offsets = TestValidationModeArray.Offsets; @@ -213,7 +213,7 @@ function TestValidationModeNone(testCase, TestValidationModeArray) function TestValidationModeMinimal(testCase, TestValidationModeArray) % Verify that an error of type arrow:array:ValidateMinimalFailed - % is thrown when supplying the ValidatationMode name-value pair, + % is thrown when supplying the ValidationMode name-value pair, % with a value of arrow.array.ValidationMode.Minimal, to the % arrow.array.ListArray.fromArrays method, if the provided offsets % and values arrays are invalid. @@ -231,7 +231,7 @@ function TestValidationModeMinimal(testCase, TestValidationModeArray) function TestValidationModeFull(testCase, TestValidationModeArray) % Verify that an error of type arrow:array:ValidateFullFailed - % is thrown when supplying the ValidatationMode name-value pair, + % is thrown when supplying the ValidationMode name-value pair, % with a value of arrow.array.ValidationMode.Full, to the % arrow.array.ListArray.fromArrays method, if the provided offsets % and values arrays are invalid. @@ -250,7 +250,7 @@ function TestValidationModeFull(testCase, TestValidationModeArray) function TestValidationModeUnsupportedEnum(testCase) % Verify that an error of type arrow:array:ValidateUnsupportedEnum % is thrown when an unsupported integer enumeration value is - % supplied for the ValidatationMode parameter to the internal + % supplied for the ValidationMode parameter to the internal % C++ ListArray Proxy validate method. offsets = arrow.array.Int32Array.fromMATLAB(int32([0, 1, 2])); values = arrow.array.Float64Array.fromMATLAB([1, 2, 3]); diff --git a/matlab/test/arrow/array/tTime32Array.m b/matlab/test/arrow/array/tTime32Array.m index 24c3508a86015..99d5839974008 100644 --- a/matlab/test/arrow/array/tTime32Array.m +++ b/matlab/test/arrow/array/tTime32Array.m @@ -271,7 +271,7 @@ function TestIsEqualFalse(tc, Unit) tc.verifyFalse(isequal(array1, array1, array3, array4, array5)); end - function TestIsEqualFalseTimeUnitMistmatch(tc) + function TestIsEqualFalseTimeUnitMismatch(tc) % Verify two Time32Arrays are not considered equal if they have % different TimeUnit values. diff --git a/matlab/test/arrow/array/tTime64Array.m b/matlab/test/arrow/array/tTime64Array.m index 3f66ebd638c65..816b7acddfc86 100644 --- a/matlab/test/arrow/array/tTime64Array.m +++ b/matlab/test/arrow/array/tTime64Array.m @@ -128,10 +128,10 @@ function TestDuration(testCase, Unit) function TestValid(testCase, Unit) % Verify the Valid property returns the expected logical vector. times = seconds([100 200 NaN 355 NaN 400]); - arrray = testCase.ArrowArrayConstructorFcn(times, TImeUnit=Unit); - testCase.verifyEqual(arrray.Valid, [true; true; false; true; false; true]); - testCase.verifyEqual(toMATLAB(arrray), times'); - testCase.verifyEqual(duration(arrray), times'); + array = testCase.ArrowArrayConstructorFcn(times, TimeUnit=Unit); + testCase.verifyEqual(array.Valid, [true; true; false; true; false; true]); + testCase.verifyEqual(toMATLAB(array), times'); + testCase.verifyEqual(duration(array), times'); end function InferNullsTrueNVPair(testCase, Unit) @@ -299,7 +299,7 @@ function TestIsEqualFalse(tc, Unit) tc.verifyFalse(isequal(array1, array1, array3, array4, array5)); end - function TestIsEqualFalseTimeUnitMistmatch(tc) + function TestIsEqualFalseTimeUnitMismatch(tc) % Verify two Time64Arrays are not considered equal if they have % different TimeUnit values. 
times1 = seconds([1 2 3 4]); diff --git a/matlab/test/arrow/array/tTimestampArray.m b/matlab/test/arrow/array/tTimestampArray.m index 54af192f935a5..cfb18cdcefac0 100644 --- a/matlab/test/arrow/array/tTimestampArray.m +++ b/matlab/test/arrow/array/tTimestampArray.m @@ -234,7 +234,7 @@ function TestIsEqualFalse(tc, TimeZone) tc.verifyFalse(isequal(array1, array1, array3, array4, array5)); end - function TestIsEqualFalseTimeZoneMistmatch(tc) + function TestIsEqualFalseTimeZoneMismatch(tc) % Verify two TimestampArrays are not considered equal if one % has a TimeZone and one does not. dates1 = datetime(2023, 6, 22, TimeZone="America/Anchorage") + days(0:4); @@ -263,7 +263,7 @@ function TestIsEqualSameInstantDifferentTimeZone(tc) tc.verifyFalse(isequal(array1, array2)); end - function TestIsEqualFalseTimeUnitMistmatch(tc, TimeZone) + function TestIsEqualFalseTimeUnitMismatch(tc, TimeZone) % Verify two TimestampArrays are not considered equal if their % TimeUnit values differ. dates1 = datetime(2023, 6, 22, TimeZone=TimeZone) + days(0:4); diff --git a/matlab/test/arrow/internal/validate/index/tNumeric.m b/matlab/test/arrow/internal/validate/index/tNumeric.m index e3d1c9d7e51b2..d4e81fa2e196c 100644 --- a/matlab/test/arrow/internal/validate/index/tNumeric.m +++ b/matlab/test/arrow/internal/validate/index/tNumeric.m @@ -20,7 +20,7 @@ methods (Test) function NonPositiveError(testCase) - % Verify numeric() throws an error whose idenitifier is + % Verify numeric() throws an error whose identifier is % "arrow:badsubscript:NonPositive" if the index array provided % has non-positive values. @@ -39,7 +39,7 @@ function NonPositiveError(testCase) end function NonIntegerError(testCase) - % Verify numeric() throws an error whose idenitifier is + % Verify numeric() throws an error whose identifier is % "arrow:badsubscript:NonInteger" if the index array provided % has non-integer values. @@ -61,7 +61,7 @@ function NonIntegerError(testCase) end function NonRealError(testCase) - % Verify numeric() throws an error whose idenitifier is + % Verify numeric() throws an error whose identifier is % "arrow:badsubscript:NonReal" if the index array is % complex. @@ -77,7 +77,7 @@ function NonRealError(testCase) end function ExceedsIntMaxError(testCase) - % Verify numeric() throws an error whose idenitifier is + % Verify numeric() throws an error whose identifier is % "arrow:badsubscript:ExceedsIntMax" if the index array % provided has values that exceed the intmax(intType). @@ -122,7 +122,7 @@ function ConvertSparseToFullStorage(testCase) end function ErrorIfNonNumeric(testCase) - % Verify numeric() throws an error whose idenitifer is + % Verify numeric() throws an error whose identifier is % "arrow:badsubscript:NonNumeric" if provided a non-numeric % array as the index. diff --git a/matlab/test/arrow/internal/validate/index/tString.m b/matlab/test/arrow/internal/validate/index/tString.m index e8e27024ea84f..660b2108fde55 100644 --- a/matlab/test/arrow/internal/validate/index/tString.m +++ b/matlab/test/arrow/internal/validate/index/tString.m @@ -20,9 +20,9 @@ methods(Test) function MissingStringError(testCase) - % Verify string() throws an error whose idenitifier is + % Verify string() throws an error whose identifier is % "arrow:badsubscript:MissingString" if the index array - % provided has mising string values. + % provided has missing string values. 
import arrow.internal.validate.* @@ -74,7 +74,7 @@ function ValidStringIndices(testCase) end function ErrorIfNonString(testCase) - % Verify string() throws an error whose idenitifer is + % Verify string() throws an error whose identifier is % "arrow:badsubscript:NonString" if neither a string array, % char array, nor cellstr array was provided as the index. diff --git a/matlab/test/arrow/tabular/tRecordBatch.m b/matlab/test/arrow/tabular/tRecordBatch.m index 700d1aac4c7c8..94166f6f3368a 100644 --- a/matlab/test/arrow/tabular/tRecordBatch.m +++ b/matlab/test/arrow/tabular/tRecordBatch.m @@ -235,7 +235,7 @@ function FromArraysNoInputs(testCase) end function Schema(tc) - % Verify that the public Schema property returns an approprate + % Verify that the public Schema property returns an appropriate % instance of arrow.tabular.Schema. t = table(["A"; "B"; "C"], ... [1; 2; 3], ... @@ -331,7 +331,7 @@ function ErrorIfColumnNameDoesNotExist(testCase) ColumnNames=["A", "B", "C"] ... ); - % Matching should be case sensitive. + % Matching should be case-sensitive. name = "a"; testCase.verifyError(@() recordBatch.column(name), "arrow:tabular:schema:AmbiguousFieldName"); diff --git a/matlab/test/arrow/tabular/tSchema.m b/matlab/test/arrow/tabular/tSchema.m index bb95c1823b9fc..4acb17b3a236d 100644 --- a/matlab/test/arrow/tabular/tSchema.m +++ b/matlab/test/arrow/tabular/tSchema.m @@ -53,7 +53,7 @@ function ClassType(testCase) function ConstructSchemaFromProxy(testCase) % Verify that an arrow.tabular.Schema instance can be - % constructred directly from an existing + % constructed directly from an existing % arrow.tabular.proxy.Schema Proxy instance. schema1 = arrow.schema(arrow.field("a", arrow.uint8)); % Construct an instance of arrow.tabular.Schema directly from a @@ -117,7 +117,7 @@ function FieldsNoSetter(testCase) end function NumFields(testCase) - % Verify that the NumFields property returns an execpted number + % Verify that the NumFields property returns an expected number % of fields. schema = arrow.schema([... arrow.field("A", arrow.uint8), ... @@ -262,7 +262,7 @@ function ErrorIfFieldNameDoesNotExist(testCase) arrow.field("C", arrow.uint32) ... ]); - % Matching should be case sensitive. + % Matching should be case-sensitive. fieldName = "a"; testCase.verifyError(@() schema.field(fieldName), "arrow:tabular:schema:AmbiguousFieldName"); diff --git a/matlab/test/arrow/tabular/tTable.m b/matlab/test/arrow/tabular/tTable.m index ee3cc572f2af8..63b21bdc09500 100644 --- a/matlab/test/arrow/tabular/tTable.m +++ b/matlab/test/arrow/tabular/tTable.m @@ -333,7 +333,7 @@ function ErrorIfColumnNameDoesNotExist(testCase) ColumnNames=["A", "B", "C"] ... ); - % Matching should be case sensitive. + % Matching should be case-sensitive. name = "a"; testCase.verifyError(@() arrowTable.column(name), "arrow:tabular:schema:AmbiguousFieldName"); @@ -547,7 +547,7 @@ function ConstructionFunctionNoInputs(testCase) end function Schema(testCase) - % Verify that the public Schema property returns an approprate + % Verify that the public Schema property returns an appropriate % instance of arrow.tabular.Table. matlabTable = table(... ["A"; "B"; "C"], ... 
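The RecordBatch, Schema, and Table tests above all assert that column and field name matching is case-sensitive. A rough pyarrow analogue of that behavior is sketched below, with a schema invented to mirror the ["A", "B", "C"] column names used in the tests; the MATLAB-specific error identifier arrow:tabular:schema:AmbiguousFieldName has no direct Python counterpart.

import pyarrow as pa

# Invented schema with the same column names as the MATLAB tests.
schema = pa.schema([("A", pa.string()), ("B", pa.float64()), ("C", pa.uint32())])

# Name lookup is exact; "a" does not match "A".
print(schema.get_field_index("A"))   # 0
print(schema.get_field_index("a"))   # -1 (not found, no case-insensitive fallback)
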
diff --git a/matlab/test/arrow/tabular/tTabularDisplay.m b/matlab/test/arrow/tabular/tTabularDisplay.m index 027517edeb2d6..a6d9f23b244b3 100644 --- a/matlab/test/arrow/tabular/tTabularDisplay.m +++ b/matlab/test/arrow/tabular/tTabularDisplay.m @@ -129,7 +129,7 @@ function OneRowOneColumn(testCase, TabularType) end function MultipleRowsAndColumns(testCase, TabularType) - % Verify tabular object display when the object has mulitple rows + % Verify tabular object display when the object has multiple rows % and columns. Only the first row is displayed. All columns are % displayed. import arrow.internal.test.display.makeLinkString @@ -156,7 +156,7 @@ function MultipleRowsAndColumns(testCase, TabularType) end function VeryWideTabular(testCase, TabularType) - % Verify that all variables are displayed without any trucation + % Verify that all variables are displayed without any truncation % even when the tabular object is wider than the MATLAB Command % Window. import arrow.internal.test.display.makeLinkString diff --git a/matlab/tools/addInstallDirToSearchPath.m b/matlab/tools/addInstallDirToSearchPath.m index 67253439549d3..2f38007beecc7 100644 --- a/matlab/tools/addInstallDirToSearchPath.m +++ b/matlab/tools/addInstallDirToSearchPath.m @@ -24,7 +24,7 @@ function addInstallDirToSearchPath(installDirPath, addInstallDirToSearchPath, ad % Return exit code 1 to indicate savepath failure and 0 to indicate the path has % been saved successfully. if status == 0 - disp("Sucessfully added installation directory to the MATLAB Search Path: " + installDirPath); + disp("Successfully added installation directory to the MATLAB Search Path: " + installDirPath); quit(0); else quit(1); @@ -48,7 +48,7 @@ function addInstallDirToSearchPath(installDirPath, addInstallDirToSearchPath, ad % fopen failed. quit(2); end - disp("Sucessfully appended an addpath command to the MATLAB startup.m file located at the userpath to add installation directory to the MATLAB Search Path: " + installDirPath); + disp("Successfully appended an addpath command to the MATLAB startup.m file located at the userpath to add installation directory to the MATLAB Search Path: " + installDirPath); quit(0); end end diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 529265235c746..3f810d27271e5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,5 +1,5 @@ # Licensed to the Apache Software Foundation (ASF) under one -# or more cod ntributor license agreements. See the NOTICE file +# or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the diff --git a/python/examples/minimal_build/build_conda.sh b/python/examples/minimal_build/build_conda.sh index cd0030ac5fc65..72c3a5f9ea2cd 100755 --- a/python/examples/minimal_build/build_conda.sh +++ b/python/examples/minimal_build/build_conda.sh @@ -91,7 +91,7 @@ popd # Build and test Python library pushd $ARROW_ROOT/python -rm -rf build/ # remove any pesky pre-existing build directory +rm -rf build/ # remove any pesky preexisting build directory export CMAKE_PREFIX_PATH=${ARROW_HOME}${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}} export PYARROW_BUILD_TYPE=Debug diff --git a/python/examples/minimal_build/build_venv.sh b/python/examples/minimal_build/build_venv.sh index d0432049f7a86..3bd641d0e72c9 100755 --- a/python/examples/minimal_build/build_venv.sh +++ b/python/examples/minimal_build/build_venv.sh @@ -62,7 +62,7 @@ popd # Build and test Python library pushd $ARROW_ROOT/python -rm -rf build/ # remove any pesky pre-existing build directory +rm -rf build/ # remove any pesky preexisting build directory export CMAKE_PREFIX_PATH=${ARROW_HOME}${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}} export PYARROW_BUILD_TYPE=Debug diff --git a/python/pyarrow/_acero.pyx b/python/pyarrow/_acero.pyx index bb3196c86ef0f..1c9b2f75c39f1 100644 --- a/python/pyarrow/_acero.pyx +++ b/python/pyarrow/_acero.pyx @@ -155,7 +155,7 @@ class ProjectNodeOptions(_ProjectNodeOptions): List of expressions to evaluate against the source batch. This must be scalar expressions. names : list of str, optional - List of names for each of the ouptut columns (same length as + List of names for each of the output columns (same length as `expressions`). If `names` is not provided, the string representations of exprs will be used. """ @@ -213,7 +213,7 @@ class AggregateNodeOptions(_AggregateNodeOptions): Parameters ---------- aggregates : list of tuples - Aggregations which will be applied to the targetted fields. + Aggregations which will be applied to the targeted fields. Specified as a list of tuples, where each tuple is one aggregation specification and consists of: aggregation target column(s) followed by function name, aggregation function options object and the diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 51dfdbf8ebbbe..a267d53599436 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1390,7 +1390,7 @@ class TakeOptions(_TakeOptions): ---------- boundscheck : boolean, default True Whether to check indices are within bounds. If False and an - index is out of boundes, behavior is undefined (the process + index is out of bounds, behavior is undefined (the process may crash). """ @@ -1468,7 +1468,7 @@ cdef class _StructFieldOptions(FunctionOptions): def _set_options(self, indices): if isinstance(indices, (list, tuple)) and not len(indices): - # Allow empty indices; effecitively return same array + # Allow empty indices; effectively return same array self.wrapped.reset( new CStructFieldOptions(indices)) return @@ -2991,7 +2991,7 @@ def register_aggregate_function(func, function_name, function_doc, in_types, out This is often used with ordered or segmented aggregation where groups can be emit before accumulating all of the input data. - Note that currently the size of any input column can not exceed 2 GB + Note that currently the size of any input column cannot exceed 2 GB for a single segment (all groups combined). 
Parameters @@ -3076,7 +3076,7 @@ def register_tabular_function(func, function_name, function_doc, in_types, out_t UdfContext and returning a generator of struct arrays. The in_types argument must be empty and the out_type argument specifies a schema. Each struct array must have field types - correspoding to the schema. + corresponding to the schema. Parameters ---------- diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx index dc7f42c10bfbe..ba799a105e7e1 100644 --- a/python/pyarrow/_cuda.pyx +++ b/python/pyarrow/_cuda.pyx @@ -493,7 +493,7 @@ cdef class CudaBuffer(Buffer): raise ValueError( 'requested more to copy than available from ' 'device buffer') - # copy nbytes starting from position to new host buffeer + # copy nbytes starting from position to new host buffer c_nbytes = nbytes buf = allocate_buffer(c_nbytes, memory_pool=memory_pool, resizable=resizable) diff --git a/python/pyarrow/_dataset.pxd b/python/pyarrow/_dataset.pxd index bee9fc1f0987a..220ab6b19affe 100644 --- a/python/pyarrow/_dataset.pxd +++ b/python/pyarrow/_dataset.pxd @@ -22,11 +22,10 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow_dataset cimport * from pyarrow.lib cimport * -from pyarrow._fs cimport FileSystem +from pyarrow._fs cimport FileSystem, FileInfo -cdef CFileSource _make_file_source(object file, FileSystem filesystem=*) - +cdef CFileSource _make_file_source(object file, FileSystem filesystem=*, object file_size=*) cdef class DatasetFactory(_Weakrefable): diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index d7d69965d000a..b93f71969e8d3 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -32,7 +32,7 @@ from pyarrow.includes.libarrow_dataset cimport * from pyarrow._acero cimport ExecNodeOptions from pyarrow._compute cimport Expression, _bind from pyarrow._compute import _forbid_instantiation -from pyarrow._fs cimport FileSystem, FileSelector +from pyarrow._fs cimport FileSystem, FileSelector, FileInfo from pyarrow._csv cimport ( ConvertOptions, ParseOptions, ReadOptions, WriteOptions) from pyarrow.util import _is_iterable, _is_path_like, _stringify_path @@ -96,27 +96,33 @@ def _get_parquet_symbol(name): return _dataset_pq and getattr(_dataset_pq, name) -cdef CFileSource _make_file_source(object file, FileSystem filesystem=None): +cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, object file_size=None): cdef: CFileSource c_source shared_ptr[CFileSystem] c_filesystem + CFileInfo c_info c_string c_path shared_ptr[CRandomAccessFile] c_file shared_ptr[CBuffer] c_buffer + int64_t c_size if isinstance(file, Buffer): c_buffer = pyarrow_unwrap_buffer(file) c_source = CFileSource(move(c_buffer)) - elif _is_path_like(file): if filesystem is None: raise ValueError("cannot construct a FileSource from " "a path without a FileSystem") c_filesystem = filesystem.unwrap() c_path = tobytes(_stringify_path(file)) - c_source = CFileSource(move(c_path), move(c_filesystem)) + if file_size is not None: + c_size = file_size + c_info = FileInfo(c_path, size=c_size).unwrap() + c_source = CFileSource(move(c_info), move(c_filesystem)) + else: + c_source = CFileSource(move(c_path), move(c_filesystem)) elif hasattr(file, 'read'): # Optimistically hope this is file-like c_file = get_native_file(file, False).get_random_access_file() @@ -853,7 +859,7 @@ cdef class Dataset(_Weakrefable): Which suffix to add to right column names. This prevents confusion when the columns in left and right datasets have colliding names. 
right_suffix : str, default None - Which suffic to add to the left column names. This prevents confusion + Which suffix to add to the left column names. This prevents confusion when the columns in left and right datasets have colliding names. coalesce_keys : bool, default True If the duplicated keys should be omitted from one of the sides @@ -1016,7 +1022,7 @@ cdef class FileSystemDataset(Dataset): elif not isinstance(root_partition, Expression): raise TypeError( "Argument 'root_partition' has incorrect type (expected " - "Epression, got {0})".format(type(root_partition)) + "Expression, got {0})".format(type(root_partition)) ) for fragment in fragments: @@ -1230,7 +1236,7 @@ cdef class FileFormat(_Weakrefable): The schema inferred from the file """ cdef: - CFileSource c_source = _make_file_source(file, filesystem) + CFileSource c_source = _make_file_source(file, filesystem, file_size=None) CResult[shared_ptr[CSchema]] c_result with nogil: c_result = self.format.Inspect(c_source) @@ -1238,7 +1244,8 @@ cdef class FileFormat(_Weakrefable): return pyarrow_wrap_schema(move(c_schema)) def make_fragment(self, file, filesystem=None, - Expression partition_expression=None): + Expression partition_expression=None, + *, file_size=None): """ Make a FileFragment from a given file. @@ -1252,6 +1259,9 @@ cdef class FileFormat(_Weakrefable): partition_expression : Expression, optional An expression that is guaranteed true for all rows in the fragment. Allows fragment to be potentially skipped while scanning with a filter. + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. Returns ------- @@ -1260,8 +1270,7 @@ cdef class FileFormat(_Weakrefable): """ if partition_expression is None: partition_expression = _true - - c_source = _make_file_source(file, filesystem) + c_source = _make_file_source(file, filesystem, file_size) c_fragment = GetResultValue( self.format.MakeFragment(move(c_source), partition_expression.unwrap(), diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 31aa058706a87..d458ac4ee710d 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -235,7 +235,7 @@ cdef class ParquetFileFormat(FileFormat): return f"" def make_fragment(self, file, filesystem=None, - Expression partition_expression=None, row_groups=None): + Expression partition_expression=None, row_groups=None, *, file_size=None): """ Make a FileFragment from a given file. @@ -251,6 +251,9 @@ cdef class ParquetFileFormat(FileFormat): fragment to be potentially skipped while scanning with a filter. row_groups : Iterable, optional The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. Returns ------- @@ -259,15 +262,13 @@ cdef class ParquetFileFormat(FileFormat): """ cdef: vector[int] c_row_groups - if partition_expression is None: partition_expression = _true - if row_groups is None: return super().make_fragment(file, filesystem, - partition_expression) + partition_expression, file_size=file_size) - c_source = _make_file_source(file, filesystem) + c_source = _make_file_source(file, filesystem, file_size) c_row_groups = [ row_group for row_group in set(row_groups)] c_fragment = GetResultValue( @@ -415,7 +416,7 @@ cdef class ParquetFileFragment(FileFragment): the Parquet RowGroup statistics). 
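A rough sketch of the new keyword added in this hunk (the bucket, object path and filesystem below are hypothetical); supplying the known size lets the fragment skip its own metadata lookup on a high-latency filesystem:

import pyarrow.dataset as ds
import pyarrow.fs as pafs

s3 = pafs.S3FileSystem()  # hypothetical credentials/region
path = "my-bucket/data/part-0.parquet"  # hypothetical object

fmt = ds.ParquetFileFormat()
size = s3.get_file_info(path).size

# file_size is keyword-only; the fragment reuses it instead of asking
# the filesystem again before reading.
fragment = fmt.make_fragment(path, s3, file_size=size)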
schema : Schema, default None Schema to use when filtering row groups. Defaults to the - Fragment's phsyical schema + Fragment's physical schema Returns ------- @@ -450,7 +451,7 @@ cdef class ParquetFileFragment(FileFragment): the Parquet RowGroup statistics). schema : Schema, default None Schema to use when filtering row groups. Defaults to the - Fragment's phsyical schema + Fragment's physical schema row_group_ids : list of ints The row group IDs to include in the subset. Can only be specified if `filter` is None. @@ -606,6 +607,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): write_batch_size=self._properties["write_batch_size"], dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"], write_page_index=self._properties["write_page_index"], + write_page_checksum=self._properties["write_page_checksum"], ) def _set_arrow_properties(self): @@ -655,6 +657,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): dictionary_pagesize_limit=None, write_page_index=False, encryption_config=None, + write_page_checksum=False, ) self._set_properties() @@ -686,7 +689,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): pre_buffer : bool, default True If enabled, pre-buffer the raw Parquet data instead of issuing one read per column chunk. This can improve performance on high-latency - filesystems (e.g. S3, GCS) by coalesing and issuing file reads in + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in parallel using a background I/O thread pool. Set to False if you want to prioritize minimal memory usage over maximum speed. @@ -701,6 +704,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None If not None, use the provided ParquetDecryptionConfig to decrypt the Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. 
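For dataset scans the same flag is wired through ParquetFragmentScanOptions; a minimal sketch under an assumed dataset path (usage mirrors the tests added later in this patch):

import pyarrow.dataset as ds

scan_opts = ds.ParquetFragmentScanOptions(page_checksum_verification=True)
fmt = ds.ParquetFileFormat(default_fragment_scan_options=scan_opts)

# A corrupted page now fails the scan instead of silently yielding bad data.
table = ds.dataset("path/to/parquet_dir", format=fmt).to_table()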
""" # Avoid mistakingly creating attributes @@ -711,7 +716,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): bint pre_buffer=True, thrift_string_size_limit=None, thrift_container_size_limit=None, - decryption_config=None): + decryption_config=None, + bint page_checksum_verification=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) self.use_buffered_stream = use_buffered_stream @@ -723,6 +729,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): self.thrift_container_size_limit = thrift_container_size_limit if decryption_config is not None: self.parquet_decryption_config = decryption_config + self.page_checksum_verification = page_checksum_verification cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): FragmentScanOptions.init(self, sp) @@ -802,6 +809,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): set_decryption_config(self, config) self._parquet_decryption_config = config + @property + def page_checksum_verification(self): + return self.reader_properties().page_checksum_verification() + + @page_checksum_verification.setter + def page_checksum_verification(self, bint page_checksum_verification): + self.reader_properties().set_page_checksum_verification(page_checksum_verification) + def equals(self, ParquetFragmentScanOptions other): """ Parameters @@ -814,11 +829,12 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): """ attrs = ( self.use_buffered_stream, self.buffer_size, self.pre_buffer, - self.thrift_string_size_limit, self.thrift_container_size_limit) + self.thrift_string_size_limit, self.thrift_container_size_limit, + self.page_checksum_verification) other_attrs = ( other.use_buffered_stream, other.buffer_size, other.pre_buffer, other.thrift_string_size_limit, - other.thrift_container_size_limit) + other.thrift_container_size_limit, other.page_checksum_verification) return attrs == other_attrs @staticmethod @@ -835,6 +851,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): pre_buffer=self.pre_buffer, thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, + page_checksum_verification=self.page_checksum_verification ) return ParquetFragmentScanOptions._reconstruct, (kwargs,) diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index 8fe9465a13d9c..a2ff045f256ac 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -2664,7 +2664,7 @@ cdef class TracingServerMiddlewareFactory(ServerMiddlewareFactory): cdef class ServerMiddleware(_Weakrefable): """Server-side middleware for a call, instantiated per RPC. - Methods here should be fast and must be infalliable: they should + Methods here should be fast and must be infallible: they should not raise exceptions or stall indefinitely. 
""" diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 39cdcc063b503..59b50ceda8c40 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -380,6 +380,9 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: shared_ptr[CFileDecryptionProperties] file_decryption_properties() \ const + c_bool page_checksum_verification() const + void set_page_checksum_verification(c_bool check_crc) + CReaderProperties default_reader_properties() cdef cppclass ArrowReaderProperties: @@ -428,6 +431,8 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit) Builder* enable_write_page_index() Builder* disable_write_page_index() + Builder* enable_page_checksum() + Builder* disable_page_checksum() shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: @@ -576,7 +581,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( FileEncryptionProperties encryption_properties=*, write_batch_size=*, dictionary_pagesize_limit=*, - write_page_index=*) except * + write_page_index=*, + write_page_checksum=*) except * cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties( diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 089ed7c75ce58..35344eb735516 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -487,7 +487,7 @@ cdef class ColumnChunkMetaData(_Weakrefable): @property def total_compressed_size(self): - """Compresssed size in bytes (int).""" + """Compressed size in bytes (int).""" return self.metadata.total_compressed_size() @property @@ -1183,7 +1183,8 @@ cdef class ParquetReader(_Weakrefable): coerce_int96_timestamp_unit=None, FileDecryptionProperties decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): """ Open a parquet file for reading. 
@@ -1199,6 +1200,7 @@ cdef class ParquetReader(_Weakrefable): decryption_properties : FileDecryptionProperties, optional thrift_string_size_limit : int, optional thrift_container_size_limit : int, optional + page_checksum_verification : bool, default False """ cdef: shared_ptr[CFileMetaData] c_metadata @@ -1236,6 +1238,8 @@ cdef class ParquetReader(_Weakrefable): arrow_props.set_pre_buffer(pre_buffer) + properties.set_page_checksum_verification(page_checksum_verification) + if coerce_int96_timestamp_unit is None: # use the default defined in default_arrow_reader_properties() pass @@ -1559,7 +1563,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( FileEncryptionProperties encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, - write_page_index=False) except *: + write_page_index=False, + write_page_checksum=False) except *: """General writer properties""" cdef: shared_ptr[WriterProperties] properties @@ -1650,7 +1655,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( if use_byte_stream_split: if column_encoding is not None: raise ValueError( - "'use_byte_stream_split' can not be passed" + "'use_byte_stream_split' cannot be passed" "together with 'column_encoding'") else: props.encoding(ParquetEncoding_BYTE_STREAM_SPLIT) @@ -1662,7 +1667,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( column_encoding[column] = 'BYTE_STREAM_SPLIT' else: raise ValueError( - "'use_byte_stream_split' can not be passed" + "'use_byte_stream_split' cannot be passed" "together with 'column_encoding'") # column_encoding @@ -1703,6 +1708,13 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( # a size larger than this then it will be latched to this value. props.max_row_group_length(_MAX_ROW_GROUP_SIZE) + # checksum + + if write_page_checksum: + props.enable_page_checksum() + else: + props.disable_page_checksum() + # page index if write_page_index: @@ -1822,7 +1834,8 @@ cdef class ParquetWriter(_Weakrefable): write_batch_size=None, dictionary_pagesize_limit=None, store_schema=True, - write_page_index=False): + write_page_index=False, + write_page_checksum=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -1853,7 +1866,8 @@ cdef class ParquetWriter(_Weakrefable): encryption_properties=encryption_properties, write_batch_size=write_batch_size, dictionary_pagesize_limit=dictionary_pagesize_limit, - write_page_index=write_page_index + write_page_index=write_page_index, + write_page_checksum=write_page_checksum ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py index 0609e45753f0d..a5583c9e657d2 100644 --- a/python/pyarrow/acero.py +++ b/python/pyarrow/acero.py @@ -221,7 +221,7 @@ def _perform_join(join_type, left_operand, left_keys, # Do not include right table keys. As they would lead to duplicated keys continue else: - # For all the other columns incude them as they are. + # For all the other columns include them as they are. # Just recompute the suffixes that the join produced as the projection # would lose them otherwise. 
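Taken together with the reader property above, the writer-side option round-trips as in this minimal sketch (the file name is made up; the behaviour matches the tests added further down in this patch):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"a": [1, 2, 3, 4]})

# Store page-level CRC checksums on write...
pq.write_table(table, "checked.parquet", write_page_checksum=True)

# ...and verify them on read; a corrupted page raises an OSError
# mentioning "CRC checksum verification".
assert pq.read_table("checked.parquet",
                     page_checksum_verification=True) == table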
if ( diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 2e9750382277a..789e30d3e9b00 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -254,7 +254,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, schema_capsule, array_capsule = obj.__arrow_c_array__(requested_type) out_array = Array._import_from_c_capsule(schema_capsule, array_capsule) if type is not None and out_array.type != type: - # PyCapsule interface type coersion is best effort, so we need to + # PyCapsule interface type coercion is best effort, so we need to # check the type of the returned array and cast if necessary out_array = array.cast(type, safe=safe, memory_pool=memory_pool) return out_array @@ -1206,8 +1206,9 @@ cdef class Array(_PandasConvertible): cdef: CResult[int64_t] c_size_res - c_size_res = ReferencedBufferSize(deref(self.ap)) - size = GetResultValue(c_size_res) + with nogil: + c_size_res = ReferencedBufferSize(deref(self.ap)) + size = GetResultValue(c_size_res) return size def get_total_buffer_size(self): @@ -3415,7 +3416,7 @@ cdef class RunEndEncodedArray(Array): Find the physical offset of this REE array. This is the offset of the run that contains the value of the first - logical element of this array considering its offet. + logical element of this array considering its offset. This function uses binary-search, so it has a O(log N) cost. """ diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index adf21814a2c99..9301a5fee5ade 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -169,7 +169,7 @@ def partitioning(schema=None, field_names=None, flavor=None, Returns ------- Partitioning or PartitioningFactory - The partioning scheme + The partitioning scheme Examples -------- @@ -511,7 +511,7 @@ def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None, partitioning=None, partition_base_dir=None): """ Create a FileSystemDataset from a `_metadata` file created via - `pyarrrow.parquet.write_metadata`. + `pyarrow.parquet.write_metadata`. Parameters ---------- @@ -534,7 +534,7 @@ def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None, partitioning : Partitioning, PartitioningFactory, str, list of str The partitioning scheme specified with the ``partitioning()`` function. A flavor string can be used as shortcut, and with a list of - field names a DirectionaryPartitioning will be inferred. + field names a DirectoryPartitioning will be inferred. partition_base_dir : str, optional For the purposes of applying the partitioning, paths will be stripped of the partition_base_dir. Files not matching the @@ -630,7 +630,7 @@ def dataset(source, schema=None, format=None, filesystem=None, partitioning : Partitioning, PartitioningFactory, str, list of str The partitioning scheme specified with the ``partitioning()`` function. A flavor string can be used as shortcut, and with a list of - field names a DirectionaryPartitioning will be inferred. + field names a DirectoryPartitioning will be inferred. partition_base_dir : str, optional For the purposes of applying the partitioning, paths will be stripped of the partition_base_dir. 
Files not matching the diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index fda9d4449763e..b0b89f8614f18 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1385,6 +1385,10 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: @staticmethod CResult[shared_ptr[COutputStream]] Open(const c_string& path) + @staticmethod + CResult[shared_ptr[COutputStream]] OpenWithAppend" Open"( + const c_string& path, c_bool append) + int file_descriptor() cdef cppclass ReadableFile(CRandomAccessFile): diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 8901d763e3998..4566cb5004add 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -178,6 +178,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const c_string& path() const const shared_ptr[CFileSystem]& filesystem() const const shared_ptr[CBuffer]& buffer() const + const int64_t size() const # HACK: Cython can't handle all the overloads so don't declare them. # This means invalid construction of CFileSource won't be caught in # the C++ generation phase (though it will still be caught when diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index b8a3041796f97..e3179062a1e52 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -263,7 +263,7 @@ cdef extern from "arrow/python/common.h" namespace "arrow::py": cdef extern from "arrow/python/common.h" namespace "arrow::py" nogil: cdef cppclass SharedPtrNoGIL[T](shared_ptr[T]): - # This looks like the only way to satsify both Cython 2 and Cython 3 + # This looks like the only way to satisfy both Cython 2 and Cython 3 SharedPtrNoGIL& operator=(...) cdef cppclass UniquePtrNoGIL[T, DELETER=*](unique_ptr[T, DELETER]): UniquePtrNoGIL& operator=(...) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index eaf7834d5b563..e609e469b0ffa 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -372,7 +372,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: """ # In case of no missing values, we need to set ColumnNullType to # non nullable as in the current __dataframe__ protocol bit/byte masks - # can not be None + # cannot be None if self.null_count == 0: return ColumnNullType.NON_NULLABLE, None else: diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 3767b18f2a87e..fcaec41e3dcdf 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -86,20 +86,20 @@ def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table: >>> import pandas as pd >>> df = pd.DataFrame({ - ... "n_atendees": [100, 10, 1], + ... "n_attendees": [100, 10, 1], ... "country": ["Italy", "Spain", "Slovenia"], ... 
}) >>> df - n_atendees country - 0 100 Italy - 1 10 Spain - 2 1 Slovenia + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia >>> from_dataframe(df) pyarrow.Table - n_atendees: int64 + n_attendees: int64 country: large_string ---- - n_atendees: [[100,10,1]] + n_attendees: [[100,10,1]] country: [["Italy","Spain","Slovenia"]] """ if isinstance(df, pa.Table): diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 460e932b86273..6f3916640199a 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -111,6 +111,7 @@ cdef class NativeFile(_Weakrefable): self.is_readable = False self.is_writable = False self.is_seekable = False + self._is_appending = False def __dealloc__(self): if self.own_file: @@ -139,12 +140,15 @@ cdef class NativeFile(_Weakrefable): * rb: binary read * wb: binary write * rb+: binary read and write + * ab: binary append """ # Emulate built-in file modes if self.is_readable and self.is_writable: return 'rb+' elif self.is_readable: return 'rb' + elif self.is_writable and self._is_appending: + return 'ab' elif self.is_writable: return 'wb' else: @@ -1113,6 +1117,19 @@ cdef class OSFile(NativeFile): 'rb' b'OSFile' + Open the file to append: + + >>> with pa.OSFile('example_osfile.arrow', mode='ab') as f: + ... f.mode + ... f.write(b' is super!') + ... + 'ab' + 10 + >>> with pa.OSFile('example_osfile.arrow') as f: + ... f.read() + ... + b'OSFile is super!' + Inspect created OSFile: >>> pa.OSFile('example_osfile.arrow') @@ -1134,6 +1151,8 @@ cdef class OSFile(NativeFile): self._open_readable(c_path, maybe_unbox_memory_pool(memory_pool)) elif mode in ('w', 'wb'): self._open_writable(c_path) + elif mode in ('a', 'ab'): + self._open_writable(c_path, append=True) else: raise ValueError('Invalid file mode: {0}'.format(mode)) @@ -1146,10 +1165,13 @@ cdef class OSFile(NativeFile): self.is_readable = True self.set_random_access_file( handle) - cdef _open_writable(self, c_string path): + cdef _open_writable(self, c_string path, c_bool append=False): with nogil: - self.output_stream = GetResultValue(FileOutputStream.Open(path)) + self.output_stream = GetResultValue( + FileOutputStream.OpenWithAppend(path, append) + ) self.is_writable = True + self._is_appending = append def fileno(self): self._assert_open() @@ -1418,7 +1440,7 @@ cdef class Buffer(_Weakrefable): def __getreadbuffer__(self, Py_ssize_t idx, void **p): if idx != 0: - raise SystemError("accessing non-existent buffer segment") + raise SystemError("accessing nonexistent buffer segment") if p != NULL: p[0] = self.buffer.get().data() return self.size @@ -1427,7 +1449,7 @@ cdef class Buffer(_Weakrefable): if not self.buffer.get().is_mutable(): raise SystemError("trying to write an immutable buffer") if idx != 0: - raise SystemError("accessing non-existent buffer segment") + raise SystemError("accessing nonexistent buffer segment") if p != NULL: p[0] = self.buffer.get().data() return self.size @@ -1504,12 +1526,12 @@ cdef class BufferOutputStream(NativeFile): """ An output stream that writes to a resizable buffer. - The buffer is produced as a result when ``get.value()`` is called. + The buffer is produced as a result when ``getvalue()`` is called. 
Examples -------- Create an output stream, write data to it and finalize it with - ``get.value()``: + ``getvalue()``: >>> import pyarrow as pa >>> f = pa.BufferOutputStream() @@ -1609,7 +1631,7 @@ cdef class CompressedInputStream(NativeFile): Examples -------- - Create an ouput stream wich compresses the data: + Create an output stream wich compresses the data: >>> import pyarrow as pa >>> data = b"Compressed stream" @@ -1666,7 +1688,7 @@ cdef class CompressedOutputStream(NativeFile): Examples -------- - Create an ouput stream wich compresses the data: + Create an output stream wich compresses the data: >>> import pyarrow as pa >>> data = b"Compressed stream" diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 5d20a4f8b72cb..ae52f5cf34e8b 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -823,10 +823,12 @@ cdef class RecordBatchReader(_Weakrefable): Parameters ---------- - requested_schema: Schema, default None - The schema to which the stream should be casted. Currently, this is - not supported and will raise a NotImplementedError if the schema - doesn't match the current schema. + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. Returns ------- diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index ae197eca1ca6b..1440ba0750094 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -519,6 +519,7 @@ cdef class NativeFile(_Weakrefable): bint is_readable bint is_writable bint is_seekable + bint _is_appending bint own_file # By implementing these "virtual" functions (all functions in Cython diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index a0c0cabf6d312..273575b779346 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -37,7 +37,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype bint has_sparse bint _pd024 - bint _is_v1 + bint _is_v1, _is_ge_v21 def __init__(self): self._tried_importing_pandas = False @@ -74,8 +74,9 @@ cdef class _PandasAPIShim(object): "installed. 
Therefore, pandas-specific integration is not " "used.".format(self._version), stacklevel=2) return - elif self._loose_version < Version('2.0.0'): - self._is_v1 = True + + self._is_v1 = self._loose_version < Version('2.0.0') + self._is_ge_v21 = self._loose_version >= Version('2.1.0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -158,6 +159,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_v1 + def is_ge_v21(self): + self._check_import() + return self._is_ge_v21 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index be29f68a13d5f..80e313be02dcc 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -744,9 +744,11 @@ def make_datetimetz(unit, tz): return _pandas_api.datetimetz_type(unit, tz=tz) -def table_to_blockmanager(options, table, categories=None, - ignore_metadata=False, types_mapper=None): +def table_to_dataframe( + options, table, categories=None, ignore_metadata=False, types_mapper=None +): from pandas.core.internals import BlockManager + from pandas import DataFrame all_columns = [] column_indexes = [] @@ -770,7 +772,12 @@ def table_to_blockmanager(options, table, categories=None, blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes) axes = [columns, index] - return BlockManager(blocks, axes) + mgr = BlockManager(blocks, axes) + if _pandas_api.is_ge_v21(): + df = DataFrame._from_mgr(mgr, mgr.axes) + else: + df = DataFrame(mgr) + return df # Set of the string repr of all numpy dtypes that can be stored in a pandas diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 94f086177d985..e3b927aeac166 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -289,6 +289,8 @@ class ParquetFile: If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. Examples -------- @@ -336,7 +338,8 @@ def __init__(self, source, *, metadata=None, common_metadata=None, read_dictionary=None, memory_map=False, buffer_size=0, pre_buffer=False, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None, filesystem=None): + thrift_container_size_limit=None, filesystem=None, + page_checksum_verification=False): self._close_source = getattr(source, 'closed', True) @@ -355,6 +358,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None, decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) self.common_metadata = common_metadata self._nested_paths_by_prefix = self._build_nested_paths() @@ -833,7 +837,7 @@ def _sanitize_table(table, new_schema, flavor): and should be combined with a compression codec. column_encoding : string or dict, default None Specify the encoding scheme on a per column basis. - Can only be used when when ``use_dictionary`` is set to False, and + Can only be used when ``use_dictionary`` is set to False, and cannot be used in combination with ``use_byte_stream_split``. 
Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. @@ -896,6 +900,10 @@ def _sanitize_table(table, new_schema, flavor): filtering more efficient than the page header, as it gathers all the statistics for a Parquet file in a single place, avoiding scattered I/O. Note that the page index is not yet used on the read size by PyArrow. +write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. """ _parquet_writer_example_doc = """\ @@ -989,6 +997,7 @@ def __init__(self, where, schema, filesystem=None, dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, + write_page_checksum=False, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1046,6 +1055,7 @@ def __init__(self, where, schema, filesystem=None, dictionary_pagesize_limit=dictionary_pagesize_limit, store_schema=store_schema, write_page_index=write_page_index, + write_page_checksum=write_page_checksum, **options) self.is_open = True @@ -1775,6 +1785,8 @@ class ParquetDataset: If not None, override the maximum total size of containers allocated when decoding Thrift structures. The default limit should be sufficient for most Parquet files. +page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. Examples -------- @@ -1788,7 +1800,8 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, use_legacy_dataset=None, pre_buffer=True, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): extra_msg = "" if use_legacy_dataset is None: @@ -1821,6 +1834,7 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, metadata_nthreads=metadata_nthreads, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) warnings.warn( "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " @@ -1837,7 +1851,8 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, use_legacy_dataset=None, pre_buffer=True, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): if partitioning != "hive": raise ValueError( 'Only "hive" for hive-like partitioning is supported when ' @@ -1900,7 +1915,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, warnings.warn( "Specifying the 'schema' argument with 'use_legacy_dataset=" "True' is deprecated as of pyarrow 8.0.0. 
You can still " - "specify it in combination with 'use_legacy_dataet=False', " + "specify it in combination with 'use_legacy_dataset=False', " "but in that case you need to specify a pyarrow.Schema " "instead of a ParquetSchema.", FutureWarning, stacklevel=2) @@ -2428,6 +2443,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, coerce_int96_timestamp_unit=None, schema=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, + page_checksum_verification=False, **kwargs): import pyarrow.dataset as ds @@ -2446,6 +2462,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, "thrift_string_size_limit": thrift_string_size_limit, "thrift_container_size_limit": thrift_container_size_limit, + "page_checksum_verification": page_checksum_verification, } if buffer_size: read_options.update(use_buffered_stream=True, @@ -2864,6 +2881,8 @@ def partitioning(self): If not None, override the maximum total size of containers allocated when decoding Thrift structures. The default limit should be sufficient for most Parquet files. +page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. Returns ------- @@ -2958,7 +2977,8 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None): + thrift_container_size_limit=None, + page_checksum_verification=False): if not use_legacy_dataset: if metadata is not None: raise ValueError( @@ -2982,6 +3002,7 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) except ImportError: # fall back on ParquetFile for simple cases when pyarrow.dataset @@ -3013,6 +3034,7 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) return dataset.read(columns=columns, use_threads=use_threads, @@ -3029,6 +3051,11 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, "The 'ignore_prefixes' keyword is only supported when " "use_legacy_dataset=False") + if page_checksum_verification: + raise ValueError( + "The 'page_checksum_verification' keyword is only supported when " + "use_legacy_dataset=False") + if schema is not None: raise ValueError( "The 'schema' argument is only supported when " @@ -3110,6 +3137,7 @@ def write_table(table, where, row_group_size=None, version='2.6', dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, + write_page_checksum=False, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -3138,6 +3166,7 @@ def write_table(table, where, row_group_size=None, version='2.6', dictionary_pagesize_limit=dictionary_pagesize_limit, store_schema=store_schema, write_page_index=write_page_index, + write_page_checksum=write_page_checksum, **kwargs) as writer: writer.write_table(table, 
row_group_size=row_group_size) except Exception: @@ -3252,13 +3281,13 @@ def write_to_dataset(table, root_path, partition_cols=None, passed, the filename will consist of a uuid. This option is only supported for use_legacy_dataset=True. When use_legacy_dataset=None and this option is specified, - use_legacy_datase will be set to True. + use_legacy_dataset will be set to True. filesystem : FileSystem, default None If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. use_legacy_dataset : bool - Default is False. Set to True to use the the legacy behaviour + Default is False. Set to True to use the legacy behaviour (this option is deprecated, and the legacy implementation will be removed in a future version). The legacy implementation still supports the `partition_filename_cb` keyword but is less efficient @@ -3366,7 +3395,7 @@ def file_visitor(written_file): else: use_legacy_dataset = False - # Check for conflicting kewords + # Check for conflicting keywords msg_confl_0 = ( "The '{0}' argument is not supported by use_legacy_dataset={2}. " "Use only '{1}' instead." diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 8ed5d4e216e8e..e979342b886da 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1350,6 +1350,8 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) || diff --git a/python/pyarrow/src/arrow/python/datetime.h b/python/pyarrow/src/arrow/python/datetime.h index 327a61f3deb1e..7346d6bc67791 100644 --- a/python/pyarrow/src/arrow/python/datetime.h +++ b/python/pyarrow/src/arrow/python/datetime.h @@ -220,7 +220,7 @@ ARROW_PYTHON_EXPORT Result MonthDayNanoIntervalArrayToPyList( const MonthDayNanoIntervalArray& array); -/// \brief Convert the Scalar obect to a pyarrow.MonthDayNano (or None if +/// \brief Convert the Scalar object to a pyarrow.MonthDayNano (or None if /// is isn't valid). 
ARROW_PYTHON_EXPORT Result MonthDayNanoIntervalScalarToPyObject( diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 3407b32720d1a..9537aec574470 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -623,7 +623,7 @@ class TypeInferrer { // XXX(wesm): In ARROW-4324 I added accounting to check whether // all of the non-null values have NumPy dtypes, but the - // total_count not not being properly incremented here + // total_count not being properly incremented here ++(*list_inferrer_).total_count_; return list_inferrer_->VisitDType(dtype, keep_going); } diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index e47492499867a..23b92598e321e 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -121,7 +121,7 @@ const MonthDayNanoAttrData MonthDayNanoTraits:: {"minutes", /*minutes_in_hours=*/60}, {"seconds", /*seconds_in_minute=*/60}, {"milliseconds", /*milliseconds_in_seconds*/ 1000}, - {"microseconds", /*microseconds_in_millseconds=*/1000}, + {"microseconds", /*microseconds_in_milliseconds=*/1000}, {"nanoseconds", /*nanoseconds_in_microseconds=*/1000}, {nullptr, 0}}; @@ -481,7 +481,7 @@ class PyValue { // The binary-like intermediate representation is PyBytesView because it keeps temporary // python objects alive (non-contiguous memoryview) and stores whether the original - // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // object was unicode encoded or not, which is used for unicode -> bytes coercion if // there is a non-unicode object observed. static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) { @@ -819,7 +819,7 @@ class PyListConverter : public ListConverter { protected: Status ValidateBuilder(const MapType*) { if (this->list_builder_->key_builder()->null_count() > 0) { - return Status::Invalid("Invalid Map: key field can not contain null values"); + return Status::Invalid("Invalid Map: key field cannot contain null values"); } else { return Status::OK(); } diff --git a/python/pyarrow/src/arrow/python/udf.cc b/python/pyarrow/src/arrow/python/udf.cc index f7761a9277f0e..e9b72a2592738 100644 --- a/python/pyarrow/src/arrow/python/udf.cc +++ b/python/pyarrow/src/arrow/python/udf.cc @@ -275,7 +275,7 @@ struct PythonUdfHashAggregatorImpl : public HashUdfAggregator { } } - // same as ApplyGrouping in parition.cc + // same as ApplyGrouping in partition.cc // replicated the code here to avoid complicating the dependencies static Result ApplyGroupings( const ListArray& groupings, const std::shared_ptr& batch) { @@ -600,7 +600,7 @@ Status RegisterScalarAggregateFunction(PyObject* function, UdfWrapperCallback cb /// \param options User provided udf options UdfOptions AdjustForHashAggregate(const UdfOptions& options) { UdfOptions hash_options; - // Append hash_ before the function name to seperate from the scalar + // Append hash_ before the function name to separate from the scalar // version hash_options.func_name = "hash_" + options.func_name; // Extend input types with group id. 
Group id is appended by the group diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index e55a0d1dd54cb..2f8d1abd1f085 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -248,8 +248,9 @@ cdef class ChunkedArray(_PandasConvertible): cdef: CResult[int64_t] c_res_buffer - c_res_buffer = ReferencedBufferSize(deref(self.chunked_array)) - size = GetResultValue(c_res_buffer) + with nogil: + c_res_buffer = ReferencedBufferSize(deref(self.chunked_array)) + size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -449,7 +450,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> import pyarrow as pa >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) >>> animals = pa.chunked_array(( - ... ["Flamingo", "Parot", "Dog"], + ... ["Flamingo", "Parrot", "Dog"], ... ["Horse", "Brittle stars", "Centipede"] ... )) >>> n_legs.equals(n_legs) @@ -584,7 +585,7 @@ cdef class ChunkedArray(_PandasConvertible): -------- >>> import pyarrow as pa >>> animals = pa.chunked_array(( - ... ["Flamingo", "Parot", "Dog"], + ... ["Flamingo", "Parrot", "Dog"], ... ["Horse", "Brittle stars", "Centipede"] ... )) >>> animals.dictionary_encode() @@ -594,7 +595,7 @@ cdef class ChunkedArray(_PandasConvertible): -- dictionary: [ "Flamingo", - "Parot", + "Parrot", "Dog", "Horse", "Brittle stars", @@ -610,7 +611,7 @@ cdef class ChunkedArray(_PandasConvertible): -- dictionary: [ "Flamingo", - "Parot", + "Parrot", "Dog", "Horse", "Brittle stars", @@ -1127,7 +1128,7 @@ cdef class ChunkedArray(_PandasConvertible): Examples -------- >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parot", "Dog"]).dictionary_encode() + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() >>> c_arr = pa.chunked_array([arr_1, arr_2]) >>> c_arr @@ -1137,7 +1138,7 @@ cdef class ChunkedArray(_PandasConvertible): -- dictionary: [ "Flamingo", - "Parot", + "Parrot", "Dog" ] -- indices: @@ -1167,7 +1168,7 @@ cdef class ChunkedArray(_PandasConvertible): -- dictionary: [ "Flamingo", - "Parot", + "Parrot", "Dog", "Horse", "Brittle stars", @@ -1183,7 +1184,7 @@ cdef class ChunkedArray(_PandasConvertible): -- dictionary: [ "Flamingo", - "Parot", + "Parrot", "Dog", "Horse", "Brittle stars", @@ -2386,8 +2387,9 @@ cdef class RecordBatch(_Tabular): cdef: CResult[int64_t] c_res_buffer - c_res_buffer = ReferencedBufferSize(deref(self.batch)) - size = GetResultValue(c_res_buffer) + with nogil: + c_res_buffer = ReferencedBufferSize(deref(self.batch)) + size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -2804,7 +2806,7 @@ cdef class RecordBatch(_Tabular): >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) >>> names = ["n_legs", "animals"] - Construct a RecordBartch from pyarrow Arrays using names: + Construct a RecordBatch from pyarrow Arrays using names: >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) pyarrow.RecordBatch @@ -2822,7 +2824,7 @@ cdef class RecordBatch(_Tabular): 4 5 Brittle stars 5 100 Centipede - Construct a RecordBartch from pyarrow Arrays using schema: + Construct a RecordBatch from pyarrow Arrays using schema: >>> my_schema = pa.schema([ ... pa.field('n_legs', pa.int64()), @@ -3039,9 +3041,12 @@ cdef class RecordBatch(_Tabular): Parameters ---------- - requested_schema : pyarrow.lib.Schema, default None - A schema to attempt to cast the streamed data to. 
This is currently - unsupported and will raise an error. + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. Returns ------- @@ -3656,7 +3661,7 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> arr_1 = pa.array(["Flamingo", "Parot", "Dog"]).dictionary_encode() + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() >>> c_arr = pa.chunked_array([arr_1, arr_2]) >>> table = pa.table([c_arr], names=["animals"]) @@ -3665,7 +3670,7 @@ cdef class Table(_Tabular): animals: dictionary ---- animals: [ -- dictionary: - ["Flamingo","Parot","Dog"] -- indices: + ["Flamingo","Parrot","Dog"] -- indices: [0,1,2], -- dictionary: ["Horse","Brittle stars","Centipede"] -- indices: [0,1,2]] @@ -3677,9 +3682,9 @@ cdef class Table(_Tabular): animals: dictionary ---- animals: [ -- dictionary: - ["Flamingo","Parot","Dog","Horse","Brittle stars","Centipede"] -- indices: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: [0,1,2], -- dictionary: - ["Flamingo","Parot","Dog","Horse","Brittle stars","Centipede"] -- indices: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: [3,4,5]] """ cdef: @@ -4188,12 +4193,12 @@ cdef class Table(_Tabular): def _to_pandas(self, options, categories=None, ignore_metadata=False, types_mapper=None): - from pyarrow.pandas_compat import table_to_blockmanager - mgr = table_to_blockmanager( + from pyarrow.pandas_compat import table_to_dataframe + df = table_to_dataframe( options, self, categories, ignore_metadata=ignore_metadata, types_mapper=types_mapper) - return pandas_api.data_frame(mgr) + return df @property def schema(self): @@ -4334,8 +4339,9 @@ cdef class Table(_Tabular): cdef: CResult[int64_t] c_res_buffer - c_res_buffer = ReferencedBufferSize(deref(self.table)) - size = GetResultValue(c_res_buffer) + with nogil: + c_res_buffer = ReferencedBufferSize(deref(self.table)) + size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -4859,9 +4865,12 @@ cdef class Table(_Tabular): Parameters ---------- - requested_schema : pyarrow.lib.Schema, default None - A schema to attempt to cast the streamed data to. This is currently - unsupported and will raise an error. + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. 
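A small sketch of what the reworded docstring describes, assuming pyarrow's PyCapsule interface helpers such as Schema.__arrow_c_schema__ (not part of this patch):

import pyarrow as pa

table = pa.table({"a": [1, 2, 3]})

# Export the data as an ArrowArrayStream PyCapsule.
stream_capsule = table.__arrow_c_stream__()

# Requesting a different schema is not supported yet, so per the docstring
# a mismatching requested_schema capsule raises NotImplementedError.
other = pa.schema([("a", pa.int32())])
try:
    table.__arrow_c_stream__(other.__arrow_c_schema__())
except NotImplementedError:
    pass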
Returns ------- diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index dd12a2661656a..83e6ebeb7a1fc 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -18,6 +18,7 @@ from collections import OrderedDict import io import warnings +from shutil import copytree import numpy as np import pytest @@ -660,7 +661,7 @@ def test_write_error_deletes_incomplete_file(tempdir): @parametrize_legacy_dataset def test_read_non_existent_file(tempdir, use_legacy_dataset): - path = 'non-existent-file.parquet' + path = 'nonexistent-file.parquet' try: pq.read_table(path, use_legacy_dataset=use_legacy_dataset) except Exception as e: @@ -882,3 +883,134 @@ def test_thrift_size_limits(tempdir): assert got == table got = pq.read_table(path) assert got == table + + +def test_page_checksum_verification_write_table(tempdir): + """Check that checksum verification works for datasets created with + pq.write_table()""" + + # Write some sample data into a parquet file with page checksum enabled + original_path = tempdir / 'correct.parquet' + table_orig = pa.table({'a': [1, 2, 3, 4]}) + pq.write_table(table_orig, original_path, write_page_checksum=True) + + # Read file and verify that the data is correct + table_check = pq.read_table(original_path, page_checksum_verification=True) + assert table_orig == table_check + + # Read the original file as binary and swap the 31-th and 36-th bytes. This + # should be equivalent to storing the following data: + # pa.table({'a': [1, 3, 2, 4]}) + bin_data = bytearray(original_path.read_bytes()) + + # Swap two bytes to emulate corruption. Also, check that the two bytes are + # different, otherwise no corruption occurs + assert bin_data[31] != bin_data[36] + bin_data[31], bin_data[36] = bin_data[36], bin_data[31] + + # Write the corrupted data to another parquet file + corrupted_path = tempdir / 'corrupted.parquet' + corrupted_path.write_bytes(bin_data) + + # Case 1: Reading the corrupted file with read_table() and without page + # checksum verification succeeds but yields corrupted data + table_corrupt = pq.read_table(corrupted_path, + page_checksum_verification=False) + # The read should complete without error, but the table has different + # content than the original file! 
+ assert table_corrupt != table_orig + assert table_corrupt == pa.table({'a': [1, 3, 2, 4]}) + + # Case 2: Reading the corrupted file with read_table() and with page + # checksum verification enabled raises an exception + with pytest.raises(OSError, match="CRC checksum verification"): + _ = pq.read_table(corrupted_path, page_checksum_verification=True) + + # Case 3: Reading the corrupted file with ParquetFile.read() and without + # page checksum verification succeeds but yields corrupted data + corrupted_pq_file = pq.ParquetFile(corrupted_path, + page_checksum_verification=False) + table_corrupt2 = corrupted_pq_file.read() + assert table_corrupt2 != table_orig + assert table_corrupt2 == pa.table({'a': [1, 3, 2, 4]}) + + # Case 4: Reading the corrupted file with ParquetFile.read() and with page + # checksum verification enabled raises an exception + corrupted_pq_file = pq.ParquetFile(corrupted_path, + page_checksum_verification=True) + # Accessing the data should result in an error + with pytest.raises(OSError, match="CRC checksum verification"): + _ = corrupted_pq_file.read() + + # Case 5: Check that enabling page checksum verification in combination + # with legacy dataset raises an exception + with pytest.raises(ValueError, match="page_checksum_verification"): + _ = pq.read_table(corrupted_path, + page_checksum_verification=True, + use_legacy_dataset=True) + + +@pytest.mark.dataset +@pytest.mark.parametrize( + "use_legacy_dataset", + [ + False, + pytest.param( + True, + marks=pytest.mark.filterwarnings( + "ignore:Passing 'use_legacy_dataset=True':FutureWarning" + ), + ), + ], +) +def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): + """Check that checksum verification works for datasets created with + pq.write_to_dataset""" + + table_orig = pa.table({'a': [1, 2, 3, 4]}) + + # Write a sample dataset with page checksum enabled + original_dir_path = tempdir / 'correct_dir' + pq.write_to_dataset(table_orig, + original_dir_path, + write_page_checksum=True, + use_legacy_dataset=use_legacy_dataset) + + # Read file and verify that the data is correct + original_file_path_list = list(original_dir_path.iterdir()) + assert len(original_file_path_list) == 1 + original_path = original_file_path_list[0] + table_check = pq.read_table(original_path, page_checksum_verification=True) + assert table_orig == table_check + + # Read the original file as binary and swap the 31-th and 36-th bytes. This + # should be equivalent to storing the following data: + # pa.table({'a': [1, 3, 2, 4]}) + bin_data = bytearray(original_path.read_bytes()) + + # Swap two bytes to emulate corruption. Also, check that the two bytes are + # different, otherwise no corruption occurs + assert bin_data[31] != bin_data[36] + bin_data[31], bin_data[36] = bin_data[36], bin_data[31] + + # Write the corrupted data to another parquet dataset + # Copy dataset dir (which should be just one file) + corrupted_dir_path = tempdir / 'corrupted_dir' + copytree(original_dir_path, corrupted_dir_path) + # Corrupt just the one file with the dataset + corrupted_file_path = corrupted_dir_path / original_path.name + corrupted_file_path.write_bytes(bin_data) + + # Case 1: Reading the corrupted file with read_table() and without page + # checksum verification succeeds but yields corrupted data + table_corrupt = pq.read_table(corrupted_file_path, + page_checksum_verification=False) + # The read should complete without error, but the table has different + # content than the original file! 
+ assert table_corrupt != table_orig + assert table_corrupt == pa.table({'a': [1, 3, 2, 4]}) + + # Case 2: Reading the corrupted file with read_table() and with page + # checksum verification enabled raises an exception + with pytest.raises(OSError, match="CRC checksum verification"): + _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index be27c71b813e4..a9e99d5d65cf9 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -1622,7 +1622,7 @@ def test_read_table_schema(tempdir): expected = pa.table({'a': [1, 2, 3]}, schema=schema) assert result.equals(expected) - # reading multiple fiels + # reading multiple fields result = pq.read_table(tempdir, schema=schema) expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.equals(expected) @@ -1796,7 +1796,7 @@ def test_parquet_write_to_dataset_deprecated_properties(tempdir): @pytest.mark.dataset -def test_parquet_write_to_dataset_unsupported_keywards_in_legacy(tempdir): +def test_parquet_write_to_dataset_unsupported_keywords_in_legacy(tempdir): table = pa.table({'a': [1, 2, 3]}) path = tempdir / 'data.parquet' diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 5e6895c8dc24c..b902541015aa2 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -94,14 +94,14 @@ def test_validate_schema_write_table(tempdir): w.write_table(simple_table) -def test_parquet_invalid_writer(): +def test_parquet_invalid_writer(tempdir): # avoid segfaults with invalid construction with pytest.raises(TypeError): some_schema = pa.schema([pa.field("x", pa.int32())]) pq.ParquetWriter(None, some_schema) with pytest.raises(TypeError): - pq.ParquetWriter("some_path", None) + pq.ParquetWriter(tempdir / "some_path", None) @pytest.mark.pandas diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index 988e9b6e3146c..a43606013027c 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -265,7 +265,7 @@ def test_order_by(): expected = pa.table({"a": [3, 2, 4, 1], "b": [None, 3, 2, 1]}) assert result.equals(expected) - # emtpy ordering + # empty ordering ord_opts = OrderByNodeOptions([]) decl = Declaration.from_sequence([table_source, Declaration("order_by", ord_opts)]) with pytest.raises( diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 2f9727922b49a..599d15d023a55 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -989,7 +989,7 @@ def test_list_array_types_from_arrays_fail(list_array_type, list_type_factory): reconstructed_arr = list_array_type.from_arrays(arr.offsets, arr.values) assert reconstructed_arr.to_pylist() == [[0], [], [0, None], [0]] - # Manually specifiying offsets (with nulls) is same as mask at top level + # Manually specifying offsets (with nulls) is same as mask at top level reconstructed_arr = list_array_type.from_arrays(offsets, arr.values) assert arr == reconstructed_arr reconstructed_arr = list_array_type.from_arrays(arr.offsets, diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 4b2144d702ca5..067d96a82113f 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2385,7 +2385,7 @@ def 
_check_temporal_rounding(ts, values, unit): # Check rounding with calendar_based_origin=True. # Note: rounding to month is not supported in Pandas so we can't - # approximate this functionallity and exclude unit == "day". + # approximate this functionality and exclude unit == "day". if unit != "day": options = pc.RoundTemporalOptions( value, unit, calendar_based_origin=True) @@ -3501,7 +3501,7 @@ def test_expression_call_function(): assert str(pc.add(field, 1)) == "add(field, 1)" assert str(pc.add(field, pa.scalar(1))) == "add(field, 1)" - # Invalid pc.scalar input gives original erorr message + # Invalid pc.scalar input gives original error message msg = "only other expressions allowed as arguments" with pytest.raises(TypeError, match=msg): pc.add(field, object) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 6f3b54b0cd681..f3c25ee8c5c3b 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -25,6 +25,7 @@ import tempfile import threading import time +from shutil import copytree from urllib.parse import quote @@ -788,12 +789,15 @@ def test_parquet_scan_options(): opts5 = ds.ParquetFragmentScanOptions( thrift_string_size_limit=123456, thrift_container_size_limit=987654,) + opts6 = ds.ParquetFragmentScanOptions( + page_checksum_verification=True) assert opts1.use_buffered_stream is False assert opts1.buffer_size == 2**13 assert opts1.pre_buffer is True assert opts1.thrift_string_size_limit == 100_000_000 # default in C++ assert opts1.thrift_container_size_limit == 1_000_000 # default in C++ + assert opts1.page_checksum_verification is False assert opts2.use_buffered_stream is False assert opts2.buffer_size == 2**12 @@ -810,11 +814,14 @@ def test_parquet_scan_options(): assert opts5.thrift_string_size_limit == 123456 assert opts5.thrift_container_size_limit == 987654 + assert opts6.page_checksum_verification is True + assert opts1 == opts1 assert opts1 != opts2 assert opts2 != opts3 assert opts3 != opts4 assert opts5 != opts1 + assert opts6 != opts1 def test_file_format_pickling(pickle_module): @@ -981,6 +988,64 @@ def test_make_fragment(multisourcefs): assert row_group_fragment.row_groups == [0] +@pytest.mark.parquet +@pytest.mark.s3 +def test_make_fragment_with_size(s3_example_simple): + """ + Test passing file_size to make_fragment. Not all FS implementations make use + of the file size (by implementing an OpenInputFile that takes a FileInfo), but + s3 does, which is why it's used here. 
+ """ + table, path, fs, uri, host, port, access_key, secret_key = s3_example_simple + + file_format = ds.ParquetFileFormat() + paths = [path] + + fragments = [file_format.make_fragment(path, fs) + for path in paths] + dataset = ds.FileSystemDataset( + fragments, format=file_format, schema=table.schema, filesystem=fs + ) + + tbl = dataset.to_table() + assert tbl.equals(table) + + # true sizes -> works + sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files] + fragments_with_size = [file_format.make_fragment(path, fs, file_size=size) + for path, size in zip(paths, sizes_true)] + dataset_with_size = ds.FileSystemDataset( + fragments_with_size, format=file_format, schema=table.schema, filesystem=fs + ) + tbl = dataset.to_table() + assert tbl.equals(table) + + # too small sizes -> error + sizes_toosmall = [1 for path in paths] + fragments_with_size = [file_format.make_fragment(path, fs, file_size=size) + for path, size in zip(paths, sizes_toosmall)] + + dataset_with_size = ds.FileSystemDataset( + fragments_with_size, format=file_format, schema=table.schema, filesystem=fs + ) + + with pytest.raises(pyarrow.lib.ArrowInvalid, match='Parquet file size is 1 bytes'): + table = dataset_with_size.to_table() + + # too large sizes -> error + sizes_toolarge = [1000000 for path in paths] + fragments_with_size = [file_format.make_fragment(path, fs, file_size=size) + for path, size in zip(paths, sizes_toolarge)] + + dataset_with_size = ds.FileSystemDataset( + fragments_with_size, format=file_format, schema=table.schema, filesystem=fs + ) + + # invalid range + with pytest.raises(OSError, match='HTTP status 416'): + table = dataset_with_size.to_table() + + def test_make_csv_fragment_from_buffer(dataset_reader, pickle_module): content = textwrap.dedent(""" alpha,num,animal @@ -2242,7 +2307,7 @@ def test_construct_from_list_of_files(tempdir, dataset_reader): @pytest.mark.parquet def test_construct_from_list_of_mixed_paths_fails(mockfs): - # isntantiate from a list of mixed paths + # instantiate from a list of mixed paths files = [ 'subdir/1/xxx/file0.parquet', 'subdir/1/xxx/doesnt-exist.parquet', @@ -2253,7 +2318,7 @@ def test_construct_from_list_of_mixed_paths_fails(mockfs): @pytest.mark.parquet def test_construct_from_mixed_child_datasets(mockfs): - # isntantiate from a list of mixed paths + # instantiate from a list of mixed paths a = ds.dataset(['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) b = ds.dataset('subdir', filesystem=mockfs) @@ -5376,3 +5441,76 @@ def test_dataset_sort_by(tempdir, dstype): sorted_tab_dict = sorted_tab.to_table().to_pydict() assert sorted_tab_dict["a"] == [5, 7, 7, 35] assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"] + + +def test_checksum_write_dataset_read_dataset_to_table(tempdir): + """Check that checksum verification works for datasets created with + ds.write_dataset and read with ds.dataset.to_table""" + + table_orig = pa.table({'a': [1, 2, 3, 4]}) + + # Write a sample dataset with page checksum enabled + pq_write_format = pa.dataset.ParquetFileFormat() + write_options = pq_write_format.make_write_options( + write_page_checksum=True) + + original_dir_path = tempdir / 'correct_dir' + ds.write_dataset( + data=table_orig, + base_dir=original_dir_path, + format=pq_write_format, + file_options=write_options, + ) + + # Open dataset and verify that the data is correct + pq_scan_opts_crc = ds.ParquetFragmentScanOptions( + page_checksum_verification=True) + pq_read_format_crc = pa.dataset.ParquetFileFormat( + 
default_fragment_scan_options=pq_scan_opts_crc) + table_check = ds.dataset( + original_dir_path, + format=pq_read_format_crc + ).to_table() + assert table_orig == table_check + + # Copy dataset dir (which should be just one file) + corrupted_dir_path = tempdir / 'corrupted_dir' + copytree(original_dir_path, corrupted_dir_path) + + # Read the only file in the path as binary and swap the 31st and 36th + # bytes. This should be equivalent to storing the following data: + # pa.table({'a': [1, 3, 2, 4]}) + corrupted_file_path_list = list(corrupted_dir_path.iterdir()) + assert len(corrupted_file_path_list) == 1 + corrupted_file_path = corrupted_file_path_list[0] + bin_data = bytearray(corrupted_file_path.read_bytes()) + + # Swap two bytes to emulate corruption. Also, check that the two bytes are + # different, otherwise no corruption occurs + assert bin_data[31] != bin_data[36] + bin_data[31], bin_data[36] = bin_data[36], bin_data[31] + + # Write the corrupted data to the parquet file + corrupted_file_path.write_bytes(bin_data) + + # Case 1: Reading the corrupted file with dataset().to_table() and without + # page checksum verification succeeds but yields corrupted data + pq_scan_opts_no_crc = ds.ParquetFragmentScanOptions( + page_checksum_verification=False) + pq_read_format_no_crc = pa.dataset.ParquetFileFormat( + default_fragment_scan_options=pq_scan_opts_no_crc) + table_corrupt = ds.dataset( + corrupted_dir_path, format=pq_read_format_no_crc).to_table() + + # The read should complete without error, but the table has different + # content than the original file! + assert table_corrupt != table_orig + assert table_corrupt == pa.table({'a': [1, 3, 2, 4]}) + + # Case 2: Reading the corrupted file with dataset().to_table() and with page + # checksum verification enabled raises an exception + with pytest.raises(OSError, match="CRC checksum verification"): + _ = ds.dataset( + corrupted_dir_path, + format=pq_read_format_crc + ).to_table()
diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index b5d6f510dbc8d..d25b22990abfb 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py
@@ -123,7 +123,7 @@ def test_dataset_encryption_decryption(): filesystem=mockfs, ) - # read without descryption config -> should error is dataset was properly encrypted + # read without decryption config -> should error if dataset was properly encrypted pformat = pa.dataset.ParquetFileFormat() with pytest.raises(IOError, match=r"no decryption"): ds.dataset("sample_dataset", format=pformat, filesystem=mockfs)
diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index bf15ad0bc4d65..9553dc2507225 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py
@@ -577,7 +577,7 @@ def is_valid(self, token): def case_insensitive_header_lookup(headers, lookup_key): """Lookup the value of given key in the given headers. - The key lookup is case insensitive. + The key lookup is case-insensitive.
""" for key in headers: if key.lower() == lookup_key.lower(): diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index c540bf96818d4..59c9c449429b3 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -760,6 +760,38 @@ def test_delete_dir(fs, pathfn): fs.delete_dir(d) +def test_delete_dir_with_explicit_subdir(fs, pathfn): + # GH-38618: regression with AWS failing to delete directories, + # depending on whether they were created explicitly. Note that + # Minio doesn't reproduce the issue, so this test is not a regression + # test in itself. + skip_fsspec_s3fs(fs) + + d = pathfn('directory/') + nd = pathfn('directory/nested/') + + # deleting dir with explicit subdir + fs.create_dir(d) + fs.create_dir(nd) + fs.delete_dir(d) + dir_info = fs.get_file_info(d) + assert dir_info.type == FileType.NotFound + + # deleting dir with blob in explicit subdir + d = pathfn('directory2') + nd = pathfn('directory2/nested') + f = pathfn('directory2/nested/target-file') + + fs.create_dir(d) + fs.create_dir(nd) + with fs.open_output_stream(f) as s: + s.write(b'data') + + fs.delete_dir(d) + dir_info = fs.get_file_info(d) + assert dir_info.type == FileType.NotFound + + def test_delete_dir_contents(fs, pathfn): skip_fsspec_s3fs(fs) @@ -1303,12 +1335,12 @@ def test_s3_proxy_options(monkeypatch, pickle_module): # Missing port with pytest.raises(KeyError): S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost'}) - # Invalid proxy URI (invalid scheme htttps) + # Invalid proxy URI (invalid scheme httpsB) with pytest.raises(pa.ArrowInvalid): - S3FileSystem(proxy_options='htttps://localhost:9000') - # Invalid proxy_options dict (invalid scheme htttps) + S3FileSystem(proxy_options='httpsB://localhost:9000') + # Invalid proxy_options dict (invalid scheme httpA) with pytest.raises(pa.ArrowInvalid): - S3FileSystem(proxy_options={'scheme': 'htttp', 'host': 'localhost', + S3FileSystem(proxy_options={'scheme': 'httpA', 'host': 'localhost', 'port': 8999}) @@ -1690,11 +1722,11 @@ def test_s3_real_aws_region_selection(): assert fs.region == 'us-east-2' # Reading from the wrong region may still work for public buckets... 
- # Non-existent bucket (hopefully, otherwise need to fix this test) + # Nonexistent bucket (hopefully, otherwise need to fix this test) with pytest.raises(IOError, match="Bucket '.*' not found"): - FileSystem.from_uri('s3://x-arrow-non-existent-bucket') + FileSystem.from_uri('s3://x-arrow-nonexistent-bucket') fs, path = FileSystem.from_uri( - 's3://x-arrow-non-existent-bucket?region=us-east-3') + 's3://x-arrow-nonexistent-bucket?region=us-east-3') assert fs.region == 'us-east-3' diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 0c9e591ccd466..071962af290fc 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -229,7 +229,7 @@ def read_buffer(self, nbytes): buf = f.read_buffer(length) assert len(buf) == length assert memoryview(buf).tobytes() == dst_buf[:length] - # buf should point to the same memory, so modyfing it + # buf should point to the same memory, so modifying it memoryview(buf)[0] = ord(b'x') # should modify the original assert dst_buf[0] == ord(b'x') @@ -1114,6 +1114,13 @@ def test_os_file_writer(tmpdir): with pytest.raises(IOError): f2.read(5) + f2.close() + + # Append + with pa.OSFile(path, mode='ab') as f4: + f4.write(b'bar') + with pa.OSFile(path) as f5: + assert f5.size() == 6 # foo + bar def test_native_file_write_reject_unicode(): @@ -1152,6 +1159,18 @@ def test_native_file_modes(tmpdir): assert f.writable() assert not f.seekable() + with pa.OSFile(path, mode='ab') as f: + assert f.mode == 'ab' + assert not f.readable() + assert f.writable() + assert not f.seekable() + + with pa.OSFile(path, mode='a') as f: + assert f.mode == 'ab' + assert not f.readable() + assert f.writable() + assert not f.seekable() + with open(path, 'wb') as f: f.write(b'foooo') diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index be83f891a2fb4..a0a6174266310 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -226,7 +226,7 @@ def test_empty_rows(self): assert table.num_columns == 0 assert table.num_rows == 2 - def test_reconcile_accross_blocks(self): + def test_reconcile_across_blocks(self): # ARROW-12065: reconciling inferred types across blocks first_row = b'{ }\n' read_options = ReadOptions(block_size=len(first_row)) @@ -304,6 +304,14 @@ def test_small_random_json(self): assert table.equals(expected) assert table.to_pydict() == expected.to_pydict() + def test_load_large_json(self): + data, expected = make_random_json(num_cols=2, num_rows=100100) + # set block size is 10MB + read_options = ReadOptions(block_size=1024*1024*10) + table = self.read_bytes(data, read_options=read_options) + assert table.num_rows == 100100 + assert expected.num_rows == 100100 + def test_stress_block_sizes(self): # Test a number of small block sizes to stress block stitching data_base, expected = make_random_json(num_cols=2, num_rows=100) diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index a48ac0c3cd81a..8b8c50882b749 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -57,7 +57,7 @@ def test_io_thread_count(): def test_env_var_io_thread_count(): - # Test that the number of IO threads can be overriden with the + # Test that the number of IO threads can be overridden with the # ARROW_IO_THREADS environment variable. 
code = """if 1: import pyarrow as pa diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 10eb931592093..342beaaeb5a98 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1343,7 +1343,7 @@ def test_date_objects_typed(self): ex_values[1] = pd.NaT.value # date32 and date64 convert to [ms] in pandas v2, but - # in pandas v1 they are siliently coerced to [ns] + # in pandas v1 they are silently coerced to [ns] ex_datetime64ms = ex_values.astype('datetime64[ms]') expected_pandas = pd.DataFrame({'date32': ex_datetime64ms, 'date64': ex_datetime64ms}, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index d7585d1415479..74dee59558239 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -633,7 +633,7 @@ def test_struct(): assert s['y'].as_py() == 3.5 with pytest.raises(KeyError): - s['non-existent'] + s['nonexistent'] s = pa.scalar(None, type=ty) assert list(s) == list(s.keys()) == ['x', 'y'] diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 5dda2cfcf09e7..d4fbfb7406838 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -182,7 +182,7 @@ def has_function(fns, ext_file, fn_name): def test_get_supported_functions(): supported_functions = pa._substrait.get_supported_functions() - # It probably doesn't make sense to exhaustively verfiy this list but + # It probably doesn't make sense to exhaustively verify this list but # we can check a sample aggregate and a sample non-aggregate entry assert has_function(supported_functions, 'functions_arithmetic.yaml', 'add') diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 16343eae61245..7600f1dd33226 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -1019,7 +1019,7 @@ def test_key_value_metadata(): assert md['b'] == b'beta' assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA'] assert md.get_all('b') == [b'beta'] - assert md.get_all('unkown') == [] + assert md.get_all('unknown') == [] with pytest.raises(KeyError): md = pa.KeyValueMetadata([ diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 62d1eb5bafd4f..c8e376fefb3b8 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -26,7 +26,7 @@ # UDFs are all tested with a dataset scan pytestmark = pytest.mark.dataset -# For convience, most of the test here doesn't care about udf func docs +# For convenience, most of the test here doesn't care about udf func docs empty_udf_doc = {"summary": "", "description": ""} try: @@ -302,7 +302,7 @@ def raising_func(ctx): @pytest.fixture(scope="session") def unary_vector_func_fixture(): """ - Reigster a vector function + Register a vector function """ def pct_rank(ctx, x): # copy here to get around pandas 1.0 issue @@ -319,7 +319,7 @@ def pct_rank(ctx, x): @pytest.fixture(scope="session") def struct_vector_func_fixture(): """ - Reigster a vector function that returns a struct array + Register a vector function that returns a struct array """ def pivot(ctx, k, v, c): df = pa.RecordBatch.from_arrays([k, v, c], names=['k', 'v', 'c']).to_pandas() @@ -486,7 +486,7 @@ def add_const(ctx, scalar): func_doc, in_types, out_type) - # doc with no decription + # doc with no description func_doc = { "summary": "test summary" } diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py 
index 4f178aefc5e3d..bb693cd6637d1 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -42,7 +42,7 @@ def doc(*docstrings, **params): If the docstring is a template, it will be saved as a string. Otherwise, it will be saved as a callable and the docstring will be obtained via the __doc__ attribute. - This decorator can not be used on Cython classes due to a CPython constraint, + This decorator cannot be used on Cython classes due to a CPython constraint, which enforces the __doc__ attribute to be read-only. See https://github.com/python/cpython/issues/91309 diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 3a36a808ab1d0..1bf25e57a3cce 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 14.0.0.9000 +Version: 14.0.1.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index e00c6b51b597d..8515facdff871 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,33 @@ under the License. --> -# arrow 14.0.0.9000 +# arrow 14.0.1.9000 + +# arrow 14.0.0.2 + +## Minor improvements and fixes + +* Fixed the printf syntax to align with format checking (#38894) +* Removed bashism in configure script (#38716). +* Fixed a broken link in the README (#38657) +* Properly escape the license header in the lintr config (#38639). +* Removed spurious warnings from installation-script test suite (#38571). +* Polished installation-script after refactor (#38534) + +## Installation + +* If pkg-config fails to detect the required libraries an additional search + without pkg-config is run (#38970). +* Fetch the latest nightly Arrow C++ binary when installing a development + Version (#38236). + +# arrow 14.0.0.1 + +## Minor improvements and fixes + +* Add more debug output for build failures (#38819) +* Increase timeout during static library download (#38767) +* Fix bug where rosetta detection was causing installation failure (#38754) # arrow 14.0.0 @@ -54,10 +80,10 @@ ## Installation -* MacOS builds now use the same installation pathway as on Linux (@assignUser, +* macOS builds now use the same installation pathway as on Linux (@assignUser, #37684). * A warning message is now issued on package load when running under emulation - on MacOS (i.e., use of x86 installation of R on M1/aarch64; #37777). + on macOS (i.e., use of x86 installation of R on M1/aarch64; #37777). * R scripts that run during configuration and installation are now run using the correct R interpreter (@meztez, #37225). * Failed libarrow builds now return more detailed output (@amoeba, #37727). @@ -390,7 +416,7 @@ As of version 10.0.0, `arrow` requires C++17 to build. This means that: * The `arrow.dev_repo` for nightly builds of the R package and prebuilt libarrow binaries is now . -* Brotli and BZ2 are shipped with MacOS binaries. BZ2 is shipped with Windows binaries. (#13484) +* Brotli and BZ2 are shipped with macOS binaries. BZ2 is shipped with Windows binaries. (#13484) # arrow 8.0.0 @@ -523,7 +549,7 @@ Arrow arrays and tables can be easily concatenated: ## Other improvements and fixes * Many of the vignettes have been reorganized, restructured and expanded to improve their usefulness and clarity. -* Code to generate schemas (and individual data type specficiations) are accessible with the `$code()` method on a `schema` or `type`. 
This allows you to easily get the code needed to create a schema from an object that already has one. +* Code to generate schemas (and individual data type specifications) are accessible with the `$code()` method on a `schema` or `type`. This allows you to easily get the code needed to create a schema from an object that already has one. * Arrow `Duration` type has been mapped to R's `difftime` class. * The `decimal256()` type is supported. The `decimal()` function has been revised to call either `decimal256()` or `decimal128()` based on the value of the `precision` argument. * `write_parquet()` uses a reasonable guess at `chunk_size` instead of always writing a single chunk. This improves the speed of reading and writing large Parquet files. @@ -798,7 +824,7 @@ to send and receive data. See `vignette("flight", package = "arrow")` for an ove * `arrow` now depends on [`cpp11`](https://cpp11.r-lib.org/), which brings more robust UTF-8 handling and faster compilation * The Linux build script now succeeds on older versions of R -* MacOS binary packages now ship with zstandard compression enabled +* macOS binary packages now ship with zstandard compression enabled ## Bug fixes and other enhancements diff --git a/r/R/arrow-object.R b/r/R/arrow-object.R index 5c2cf4691fc9c..b66c39dce957e 100644 --- a/r/R/arrow-object.R +++ b/r/R/arrow-object.R @@ -56,7 +56,7 @@ ArrowObject <- R6Class("ArrowObject", # Return NULL, because keeping this R6 object in scope is not a good idea. # This syntax would allow the rare use that has to actually do this to # do `object <- object$.unsafe_delete()` and reduce the chance that an - # IDE like RStudio will try try to call other methods which will error + # IDE like RStudio will try to call other methods which will error invisible(NULL) } ) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 1f39a50744abc..54e237192e080 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -183,7 +183,7 @@ configure_tzdb <- function() { # Just to be extra safe, let's wrap this in a try(); # we don't want a failed startup message to prevent the package from loading try({ - # On MacOS only, Check if we are running in under emulation, and warn this will not work + # On macOS only, Check if we are running in under emulation, and warn this will not work if (on_rosetta()) { packageStartupMessage( paste( diff --git a/r/R/compression.R b/r/R/compression.R index 8d28fbefd7b3d..3fe00a756987c 100644 --- a/r/R/compression.R +++ b/r/R/compression.R @@ -61,7 +61,7 @@ Codec$create <- function(type = "gzip", compression_level = NA) { #' the Arrow C++ library. This function lets you know which are available for #' use. #' @param type A string, one of "uncompressed", "snappy", "gzip", "brotli", -#' "zstd", "lz4", "lzo", or "bz2", case insensitive. +#' "zstd", "lz4", "lzo", or "bz2", case-insensitive. #' @return Logical: is `type` available? #' @export #' @examples diff --git a/r/R/config.R b/r/R/config.R index bd00afe1be631..941d74e59a90d 100644 --- a/r/R/config.R +++ b/r/R/config.R @@ -40,7 +40,7 @@ io_thread_count <- function() { #' @rdname io_thread_count #' @param num_threads integer: New number of threads for thread pool. At least -#' two threads are reccomended to support all operations in the arrow +#' two threads are recommended to support all operations in the arrow #' package. 
#' @export set_io_thread_count <- function(num_threads) { diff --git a/r/R/csv.R b/r/R/csv.R index a024c4531e748..03540006ca0a2 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -76,7 +76,7 @@ #' #' Note that if you are specifying column names, whether by `schema` or #' `col_names`, and the CSV file has a header row that would otherwise be used -#' to idenfity column names, you'll need to add `skip = 1` to skip that row. +#' to identify column names, you'll need to add `skip = 1` to skip that row. #' #' @param file A character file name or URI, literal data (either a single string or a [raw] vector), #' an Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`). diff --git a/r/R/dataset.R b/r/R/dataset.R index 682f6c1481b4f..08189f1b290a2 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -46,7 +46,7 @@ #' #' The default behavior in `open_dataset()` is to inspect the file paths #' contained in the provided directory, and if they look like Hive-style, parse -#' them as Hive. If your dataset has Hive-style partioning in the file paths, +#' them as Hive. If your dataset has Hive-style partitioning in the file paths, #' you do not need to provide anything in the `partitioning` argument to #' `open_dataset()` to use them. If you do provide a character vector of #' partition column names, they will be ignored if they match what is detected, diff --git a/r/R/dplyr-count.R b/r/R/dplyr-count.R index ee713030b262e..df585a6cf0111 100644 --- a/r/R/dplyr-count.R +++ b/r/R/dplyr-count.R @@ -56,7 +56,7 @@ tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) { tally.Dataset <- tally.ArrowTabular <- tally.RecordBatchReader <- tally.arrow_dplyr_query -# we don't want to depend on dplyr, but we refrence these above +# we don't want to depend on dplyr, but we reference these above utils::globalVariables(c("n", "desc")) check_n_name <- function(name, diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index c14c67e70168c..d85fa16af2e71 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -28,20 +28,20 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) out$group_by_vars <- by$names } - filts <- expand_across(out, quos(...)) - if (length(filts) == 0) { + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { # Nothing to do return(as_adq(.data)) } # tidy-eval the filter expressions inside an Arrow data_mask - filters <- lapply(filts, arrow_eval, arrow_mask(out)) + filters <- lapply(expanded_filters, arrow_eval, arrow_mask(out)) bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) if (any(bad_filters)) { # This is similar to abandon_ship() except that the filter eval is # vectorized, and we apply filters that _did_ work before abandoning ship # with the rest - expr_labs <- map_chr(filts[bad_filters], format_expr) + expr_labs <- map_chr(expanded_filters[bad_filters], format_expr) if (query_on_dataset(out)) { # Abort. We don't want to auto-collect if this is a Dataset because that # could blow up, too big. 
@@ -71,7 +71,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) if (by$from_by) { out <- dplyr::ungroup(out) } - return(dplyr::filter(out, !!!filts[bad_filters], .by = {{ .by }})) + return(dplyr::filter(out, !!!expanded_filters[bad_filters], .by = {{ .by }})) } } diff --git a/r/R/dplyr-funcs-augmented.R b/r/R/dplyr-funcs-augmented.R index 116248d2dd92a..dca5ca16fa437 100644 --- a/r/R/dplyr-funcs-augmented.R +++ b/r/R/dplyr-funcs-augmented.R @@ -18,7 +18,7 @@ #' Add the data filename as a column #' #' This function only exists inside `arrow` `dplyr` queries, and it only is -#' valid when quering on a `FileSystemDataset`. +#' valid when querying on a `FileSystemDataset`. #' #' To use filenames generated by this function in subsequent pipeline steps, you #' must either call \code{\link[dplyr:compute]{compute()}} or diff --git a/r/R/dplyr-funcs-conditional.R b/r/R/dplyr-funcs-conditional.R index cd0245eeee182..b9639f00295ce 100644 --- a/r/R/dplyr-funcs-conditional.R +++ b/r/R/dplyr-funcs-conditional.R @@ -55,7 +55,7 @@ register_bindings_conditional <- function() { } if (last_arg && arg$type_id() %in% TYPES_WITH_NAN) { - # store the NA_real_ in the same type as arg to avoid avoid casting + # store the NA_real_ in the same type as arg to avoid casting # smaller float types to larger float types NA_expr <- Expression$scalar(Scalar$create(NA_real_, type = arg$type())) Expression$create("if_else", Expression$create("is_nan", arg), NA_expr, arg) diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 5b6e16d376554..440210afd630c 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -459,7 +459,7 @@ register_bindings_datetime_timezone <- function() { roll_dst[1], "error" = 0L, "boundary" = 2L, - arrow_not_supported("`roll_dst` value must be 'error' or 'boundary' for non-existent times; other values") + arrow_not_supported("`roll_dst` value must be 'error' or 'boundary' for nonexistent times; other values") ) ambiguous <- switch( @@ -467,7 +467,7 @@ register_bindings_datetime_timezone <- function() { "error" = 0L, "pre" = 1L, "post" = 2L, - arrow_not_supported("`roll_dst` value must be 'error', 'pre', or 'post' for non-existent times") + arrow_not_supported("`roll_dst` value must be 'error', 'pre', or 'post' for nonexistent times") ) if (identical(tzone, "")) { diff --git a/r/R/dplyr-funcs-string.R b/r/R/dplyr-funcs-string.R index 3cd8f94476e5e..9f3220e557f08 100644 --- a/r/R/dplyr-funcs-string.R +++ b/r/R/dplyr-funcs-string.R @@ -516,7 +516,7 @@ register_bindings_string_other <- function() { msg = "`stop` must be length 1 - other lengths are not supported in Arrow" ) - # substr treats values as if they're on a continous number line, so values + # substr treats values as if they're on a continuous number line, so values # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics # this behavior if (start <= 0) { diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 0bd340d4be2dd..f244682737cb4 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -158,8 +158,8 @@ register_bindings_type_cast <- function() { if (identical(fix.empty.names, TRUE)) { names(args) <- make.names(names(args), unique = TRUE) } else { - name_emtpy <- names(args) == "" - names(args)[!name_emtpy] <- make.names(names(args)[!name_emtpy], unique = TRUE) + name_empty <- names(args) == "" + names(args)[!name_empty] <- make.names(names(args)[!name_empty], unique = TRUE) } } diff --git a/r/R/duckdb.R b/r/R/duckdb.R index 
bf3a57daf2f1e..9632e9bad1984 100644 --- a/r/R/duckdb.R +++ b/r/R/duckdb.R @@ -89,7 +89,7 @@ arrow_duck_connection <- function() { # but if we don't explicitly run dbDisconnect() the user gets a warning # that they may not expect (since they did not open a duckdb connection). # This bit of code will run when the package namespace is cleaned up (i.e., - # at exit). This is more reliable than .onUnload() or .onDetatch(), which + # at exit). This is more reliable than .onUnload() or .onDetach(), which # don't necessarily run on exit. reg.finalizer(arrow_duck_finalizer, function(...) { con <- getOption("arrow_duck_con") diff --git a/r/R/extension.R b/r/R/extension.R index 4419c8ba01642..59a02121fd18c 100644 --- a/r/R/extension.R +++ b/r/R/extension.R @@ -83,7 +83,7 @@ ExtensionArray$create <- function(x, type) { #' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray] #' with this extension type. #' -#' In addition, subclasses may override the following methos to customize +#' In addition, subclasses may override the following methods to customize #' the behaviour of extension classes. #' #' - `$deserialize_instance()`: This method is called when a new [ExtensionType] @@ -184,7 +184,7 @@ ExtensionType <- R6Class("ExtensionType", }, ToString = function() { # metadata is probably valid UTF-8 (e.g., JSON), but might not be - # and it's confusing to error when printing the object. This herustic + # and it's confusing to error when printing the object. This heuristic # isn't perfect (but subclasses should override this method anyway) metadata_raw <- self$extension_metadata() @@ -286,7 +286,7 @@ ExtensionType$create <- function(storage_type, #' "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow" #' is reserved for extension types defined by the Apache Arrow libraries. #' @param extension_metadata A [raw()] or [character()] vector containing the -#' serialized version of the type. Chatacter vectors must be length 1 and +#' serialized version of the type. Character vectors must be length 1 and #' are converted to UTF-8 before converting to [raw()]. #' @param type_class An [R6::R6Class] whose `$new()` class method will be #' used to construct a new instance of the type. diff --git a/r/R/feather.R b/r/R/feather.R index 3e390018c825f..474fc6118e44f 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -24,7 +24,7 @@ #' a legacy version available starting in 2016, and the Version 2 (V2), #' which is the Apache Arrow IPC file format. #' The default version is V2. -#' V1 files are distinct from Arrow IPC files and lack many feathures, +#' V1 files are distinct from Arrow IPC files and lack many features, #' such as the ability to store all Arrow data tyeps, and compression support. #' [write_ipc_file()] can only write V2 files. #' @@ -91,7 +91,7 @@ write_feather <- function(x, } } if (is.null(compression_level)) { - # Use -1 as sentinal for "default" + # Use -1 as sentinel for "default" compression_level <- -1L } compression_level <- as.integer(compression_level) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index e0f370ad601b3..c6f92cba1932c 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -156,7 +156,7 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' buckets if `$CreateDir()` is called on the bucket level (default `FALSE`). #' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete #' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`). 
-#' - `request_timeout`: Socket read time on Windows and MacOS in seconds. If +#' - `request_timeout`: Socket read time on Windows and macOS in seconds. If #' negative, the AWS SDK default (typically 3 seconds). #' - `connect_timeout`: Socket connection timeout in seconds. If negative, AWS #' SDK default is used (typically 1 second). diff --git a/r/R/parquet.R b/r/R/parquet.R index 74f51767a29c4..d92e913cb5db3 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -128,7 +128,7 @@ read_parquet <- function(file, #' - A named vector, to specify the value for the named columns, the default #' value for the setting is used when not supplied #' -#' The `compression` argument can be any of the following (case insensitive): +#' The `compression` argument can be any of the following (case-insensitive): #' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2". #' Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip" #' are almost always included. See [codec_is_available()]. diff --git a/r/R/udf.R b/r/R/udf.R index fe08f02812fd9..922095cceba6a 100644 --- a/r/R/udf.R +++ b/r/R/udf.R @@ -154,7 +154,7 @@ arrow_scalar_function <- function(fun, in_type, out_type, auto_convert = FALSE) sprintf( paste0( "Expected `fun` to accept %d argument(s)\n", - "but found a function that acccepts %d argument(s)\n", + "but found a function that accepts %d argument(s)\n", "Did you forget to include `context` as the first argument?" ), expected_n_args, diff --git a/r/configure b/r/configure index 4f09cfdc4419b..029fc004dfc4c 100755 --- a/r/configure +++ b/r/configure @@ -62,7 +62,7 @@ PKG_CONFIG_NAME="arrow" PKG_BREW_NAME="apache-arrow" PKG_TEST_HEADER="" -# Some env vars that control the build (all logical, case insensitive) +# Some env vars that control the build (all logical, case-insensitive) # Development mode, also increases verbosity in the bundled build ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'` # The bundled build compiles arrow C++ from source; FORCE ensures we don't pick up @@ -79,9 +79,6 @@ VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //` UNAME=`uname -s` : ${PKG_CONFIG:="pkg-config"} -# These will only be set in the bundled build -S3_LIBS="" -GCS_LIBS="" # If in development mode, run the codegen script to render arrowExports.* if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then @@ -116,7 +113,9 @@ fi # Test if pkg-config is available to use if ${PKG_CONFIG} --version >/dev/null 2>&1; then PKG_CONFIG_AVAILABLE="true" + echo "*** pkg-config found." else + echo "*** pkg-config not found." PKG_CONFIG_AVAILABLE="false" ARROW_USE_PKG_CONFIG="false" fi @@ -245,12 +244,6 @@ do_bundled_build () { ${LIB_DIR}/pkgconfig/*.pc rm -f ${LIB_DIR}/pkgconfig/*.pc.bak fi - else - # This case must be ARROW_DEPENDENCY_SOURCE=BUNDLED. - # These would be identified by pkg-config, in Requires.private and Libs.private. - # Rather than try to re-implement pkg-config, we can just hard-code them here. 
- S3_LIBS="-lcurl -lssl -lcrypto" - GCS_LIBS="-lcurl -lssl -lcrypto" fi else # If the library directory does not exist, the script must not have been successful @@ -293,15 +286,15 @@ set_pkg_vars () { # If we have pkg-config, it will tell us what libarrow needs set_lib_dir_with_pc () { - LIB_DIR="`${PKG_CONFIG} --variable=libdir --silence-errors ${PKG_CONFIG_NAME}`" + LIB_DIR="`${PKG_CONFIG} --variable=libdir ${PKG_CONFIG_NAME}`" } set_pkg_vars_with_pc () { pkg_config_names="${PKG_CONFIG_NAME} ${PKG_CONFIG_NAMES_FEATURES}" - PKG_CFLAGS="`${PKG_CONFIG} --cflags --silence-errors ${pkg_config_names}` $PKG_CFLAGS" + PKG_CFLAGS="`${PKG_CONFIG} --cflags ${pkg_config_names}` $PKG_CFLAGS" PKG_CFLAGS="$PKG_CFLAGS $PKG_CFLAGS_FEATURES" - PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other --silence-errors ${pkg_config_names}` + PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other ${pkg_config_names}` PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" - PKG_DIRS=`${PKG_CONFIG} --libs-only-L --silence-errors ${pkg_config_names}` + PKG_DIRS=`${PKG_CONFIG} --libs-only-L ${pkg_config_names}` } # If we don't have pkg-config, we can make some inferences @@ -322,7 +315,7 @@ set_pkg_vars_without_pc () { if [ -n "$(find "$LIB_DIR" -name 'libarrow_bundled_dependencies.*')" ]; then PKG_LIBS="$PKG_LIBS -larrow_bundled_dependencies" fi - PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" + PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES $SSL_LIBS_WITHOUT_PC" # If on Raspberry Pi, need to manually link against latomic # See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81358 for similar example @@ -380,11 +373,13 @@ add_feature_flags () { fi if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" - PKG_LIBS_FEATURES="$PKG_LIBS_FEATURES $S3_LIBS" fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" - PKG_LIBS_FEATURES="$PKG_LIBS_FEATURES $GCS_LIBS" + fi + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + # If pkg-config is available it will handle this for us automatically + SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" fi fi } @@ -405,6 +400,18 @@ find_or_build_libarrow # Now set `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` based on that. if [ "$_LIBARROW_FOUND" != "false" ] && [ "$_LIBARROW_FOUND" != "" ]; then set_pkg_vars ${_LIBARROW_FOUND} + + # If we didn't find any libraries with pkg-config, try again without pkg-config + FOUND_PKG_LIBS=`echo "$PKG_LIBS" | tr -d '[:space:]'` + if [ -z "$FOUND_PKG_LIBS" ] && [ "$PKG_CONFIG_AVAILABLE" = "true" ]; then + echo "*** pkg-config failed to find libraries. Running detection without pkg-config." + PKG_CONFIG_AVAILABLE="false" + set_pkg_vars ${_LIBARROW_FOUND} + fi +else + # To make it easier to debug which code path was taken add a specific + # message to the log in addition to the 'NOTE' + echo "*** Failed to find Arrow C++ libraries." fi # Test that we can compile something with those flags @@ -413,7 +420,7 @@ CXX17FLAGS=`"${R_HOME}"/bin/R CMD config CXX17FLAGS` CXX17STD=`"${R_HOME}"/bin/R CMD config CXX17STD` CPPFLAGS=`"${R_HOME}"/bin/R CMD config CPPFLAGS` TEST_CMD="${CXX17} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX17FLAGS} ${CXX17STD} -xc++ -" -echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} >/dev/null 2>&1 +TEST_ERROR=$(echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} -o /dev/null 2>&1) if [ $? 
-eq 0 ]; then # Prepend PKG_DIRS to PKG_LIBS and write to Makevars @@ -428,7 +435,12 @@ else echo "------------------------- NOTE ---------------------------" echo "There was an issue preparing the Arrow C++ libraries." echo "See https://arrow.apache.org/docs/r/articles/install.html" - echo "---------------------------------------------------------" + echo "----------------------------------------------------------" + echo "" + echo "Test compile error: ${TEST_ERROR}" + echo "Failing compile command: ${TEST_CMD}" + echo "PKG_CFLAGS=$PKG_CFLAGS" + echo "PKG_LIBS=$PKG_LIBS" PKG_LIBS="" PKG_CFLAGS="" exit 1 diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd index 032a4a76bf80b..aef4d01d7539e 100644 --- a/r/man/ExtensionType.Rd +++ b/r/man/ExtensionType.Rd @@ -26,7 +26,7 @@ extension metadata as a UTF-8 encoded string. with this extension type. } -In addition, subclasses may override the following methos to customize +In addition, subclasses may override the following methods to customize the behaviour of extension classes. \itemize{ \item \verb{$deserialize_instance()}: This method is called when a new \link{ExtensionType} diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index b71d95f423ee3..dbf89ef1387ac 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -57,7 +57,7 @@ in the background, without blocking (default \code{TRUE}) buckets if \verb{$CreateDir()} is called on the bucket level (default \code{FALSE}). \item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will delete buckets if\verb{$DeleteDir()} is called on the bucket level (default \code{FALSE}). -\item \code{request_timeout}: Socket read time on Windows and MacOS in seconds. If +\item \code{request_timeout}: Socket read time on Windows and macOS in seconds. If negative, the AWS SDK default (typically 3 seconds). \item \code{connect_timeout}: Socket connection timeout in seconds. If negative, AWS SDK default is used (typically 1 second). diff --git a/r/man/add_filename.Rd b/r/man/add_filename.Rd index 93718435a2042..1fe10ea4f8f26 100644 --- a/r/man/add_filename.Rd +++ b/r/man/add_filename.Rd @@ -12,7 +12,7 @@ augmented column. } \description{ This function only exists inside \code{arrow} \code{dplyr} queries, and it only is -valid when quering on a \code{FileSystemDataset}. +valid when querying on a \code{FileSystemDataset}. } \details{ To use filenames generated by this function in subsequent pipeline steps, you diff --git a/r/man/codec_is_available.Rd b/r/man/codec_is_available.Rd index 5cda813f41673..e79b5724b8b17 100644 --- a/r/man/codec_is_available.Rd +++ b/r/man/codec_is_available.Rd @@ -8,7 +8,7 @@ codec_is_available(type) } \arguments{ \item{type}{A string, one of "uncompressed", "snappy", "gzip", "brotli", -"zstd", "lz4", "lzo", or "bz2", case insensitive.} +"zstd", "lz4", "lzo", or "bz2", case-insensitive.} } \value{ Logical: is \code{type} available? diff --git a/r/man/io_thread_count.Rd b/r/man/io_thread_count.Rd index 6cd44e1f6ea94..ae9297bb57761 100644 --- a/r/man/io_thread_count.Rd +++ b/r/man/io_thread_count.Rd @@ -11,7 +11,7 @@ set_io_thread_count(num_threads) } \arguments{ \item{num_threads}{integer: New number of threads for thread pool. 
At least -two threads are reccomended to support all operations in the arrow +two threads are recommended to support all operations in the arrow package.} } \description{ diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd index 6d0f27c321991..a7307e538b940 100644 --- a/r/man/new_extension_type.Rd +++ b/r/man/new_extension_type.Rd @@ -32,7 +32,7 @@ array.} is reserved for extension types defined by the Apache Arrow libraries.} \item{extension_metadata}{A \code{\link[=raw]{raw()}} or \code{\link[=character]{character()}} vector containing the -serialized version of the type. Chatacter vectors must be length 1 and +serialized version of the type. Character vectors must be length 1 and are converted to UTF-8 before converting to \code{\link[=raw]{raw()}}.} \item{type_class}{An \link[R6:R6Class]{R6::R6Class} whose \verb{$new()} class method will be diff --git a/r/man/open_dataset.Rd b/r/man/open_dataset.Rd index 7c3d32289f73e..7028f38467303 100644 --- a/r/man/open_dataset.Rd +++ b/r/man/open_dataset.Rd @@ -142,7 +142,7 @@ what names to give the virtual columns that come from the path segments. The default behavior in \code{open_dataset()} is to inspect the file paths contained in the provided directory, and if they look like Hive-style, parse -them as Hive. If your dataset has Hive-style partioning in the file paths, +them as Hive. If your dataset has Hive-style partitioning in the file paths, you do not need to provide anything in the \code{partitioning} argument to \code{open_dataset()} to use them. If you do provide a character vector of partition column names, they will be ignored if they match what is detected, diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index 999f2d265b7fd..b56d445c9e2e3 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -230,7 +230,7 @@ be dropped. Note that if you are specifying column names, whether by \code{schema} or \code{col_names}, and the CSV file has a header row that would otherwise be used -to idenfity column names, you'll need to add \code{skip = 1} to skip that row. +to identify column names, you'll need to add \code{skip = 1} to skip that row. } \examples{ diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 78cf60b67477f..0d3a7da3b90b4 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -59,7 +59,7 @@ and to make sharing data across data analysis languages easy. a legacy version available starting in 2016, and the Version 2 (V2), which is the Apache Arrow IPC file format. The default version is V2. -V1 files are distinct from Arrow IPC files and lack many feathures, +V1 files are distinct from Arrow IPC files and lack many features, such as the ability to store all Arrow data tyeps, and compression support. \code{\link[=write_ipc_file]{write_ipc_file()}} can only write V2 files. } diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index af976b1aabf81..480abb12fcf4a 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -86,7 +86,7 @@ value for each column, in positional order value for the setting is used when not supplied } -The \code{compression} argument can be any of the following (case insensitive): +The \code{compression} argument can be any of the following (case-insensitive): "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2". Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip" are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}. 
diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 615a84511fca9..88289e72004b3 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json
@@ -1,10 +1,10 @@ [ { - "name": "14.0.0.9000 (dev)", + "name": "14.0.1.9000 (dev)", "version": "dev/" }, { - "name": "14.0.0 (release)", + "name": "14.0.1 (release)", "version": "" }, {
diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index ae435d54d6cbb..9745393d01bbc 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp
@@ -152,12 +152,15 @@ struct AltrepVectorBase { const char* class_name = CHAR(PRINTNAME(data_class_sym)); if (IsMaterialized(alt)) { - Rprintf("materialized %s len=%d\n", class_name, Rf_xlength(Representation(alt))); + Rprintf("materialized %s len=%ld\n", class_name, + static_cast<long>(Rf_xlength(Representation(alt)))); // NOLINT: runtime/int } else { const auto& chunked_array = GetChunkedArray(alt); - Rprintf("%s<%p, %s, %d chunks, %d nulls> len=%d\n", class_name, chunked_array.get(), + Rprintf("%s<%p, %s, %d chunks, %ld nulls> len=%ld\n", class_name, + reinterpret_cast<void*>(chunked_array.get()), chunked_array->type()->ToString().c_str(), chunked_array->num_chunks(), - chunked_array->null_count(), chunked_array->length()); + static_cast<long>(chunked_array->null_count()), // NOLINT: runtime/int + static_cast<long>(chunked_array->length())); // NOLINT: runtime/int } return TRUE;
@@ -744,7 +747,7 @@ struct AltrepVectorString : public AltrepVectorBase> { // Helper class to convert to R strings. We declare one of these for the // class to avoid having to stack-allocate one for every STRING_ELT call. // This class does not own a reference to any arrays: it is the caller's - // responsibility to ensure the Array lifetime exeeds that of the viewer. + // responsibility to ensure the Array lifetime exceeds that of the viewer. struct RStringViewer { RStringViewer() : strip_out_nuls_(false), nul_was_stripped_(false) {}
@@ -819,7 +822,7 @@ struct AltrepVectorString : public AltrepVectorBase> { "'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul " "= TRUE)"; - Rf_error(stripped_string_.c_str()); + Rf_error("%s", stripped_string_.c_str()); } void SetArray(const std::shared_ptr<Array>& array) {
diff --git a/r/src/safe-call-into-r.h b/r/src/safe-call-into-r.h index 319d46d11f0d6..0ffd1d16dca01 100644 --- a/r/src/safe-call-into-r.h +++ b/r/src/safe-call-into-r.h
@@ -141,15 +141,15 @@ class MainRThread { MainRThread() : initialized_(false), executor_(nullptr), stop_source_(nullptr) {} }; -// This object is used to ensure that signal hanlders are registered when +// This object is used to ensure that signal handlers are registered when // RunWithCapturedR launches its background thread to call Arrow and is // cleaned up however this exits. Note that the lifecycle of the StopSource, // which is registered at package load, is not necessarily tied to the // lifecycle of the signal handlers. The general approach is to register // the signal handlers only when we are evaluating code outside the R thread // (when we are evaluating code *on* the R thread, R's signal handlers are -// sufficient and will signal an interupt condition that will propagate -// via a cpp11::unwind_excpetion). +// sufficient and will signal an interrupt condition that will propagate +// via a cpp11::unwind_exception).
class WithSignalHandlerContext { public: WithSignalHandlerContext() : signal_handler_registered_(false) { diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index 8d39f7252ee21..e277c645d456e 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -37,7 +37,7 @@ with_language <- function(lang, expr) { skip_on_cran() old <- Sys.getenv("LANGUAGE") # Check what this message is before changing languages; this will - # trigger caching the transations if the OS does that (some do). + # trigger caching the translations if the OS does that (some do). # If the OS does cache, then we can't test changing languages safely. before <- i18ize_error_messages() Sys.setenv(LANGUAGE = lang) diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 3d68dac5af69b..bd29080848184 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -38,11 +38,11 @@ skip_if_not_available <- function(feature) { skip_on_linux_devel() } - # curl/ssl on MacOS is too old to support S3 filesystems without + # curl/ssl on macOS is too old to support S3 filesystems without # crashing when the process exits. if (feature == "s3") { if (on_macos_10_13_or_lower()) { - skip("curl/ssl runtime on MacOS 10.13 is too old") + skip("curl/ssl runtime on macOS 10.13 is too old") } } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index b29c1f4e09dde..bb005605de318 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -371,19 +371,19 @@ test_that("support for NaN (ARROW-3615)", { expect_equal(y$null_count, 1L) }) -test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", { +test_that("is.nan() evaluates to FALSE on NA (for consistency with base R)", { x <- c(1.0, NA, NaN, -1.0) compare_expression(is.nan(.input), x) }) -test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", { +test_that("is.nan() evaluates to FALSE on non-floats (for consistency with base R)", { x <- c(1L, 2L, 3L) y <- c("foo", "bar") compare_expression(is.nan(.input), x) compare_expression(is.nan(.input), y) }) -test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", { +test_that("is.na() evaluates to TRUE on NaN (for consistency with base R)", { x <- c(1, NA, NaN, -1) compare_expression(is.na(.input), x) }) diff --git a/r/tests/testthat/test-backwards-compatibility.R b/r/tests/testthat/test-backwards-compatibility.R index 8210bd2e78fd8..5f804b02dcee7 100644 --- a/r/tests/testthat/test-backwards-compatibility.R +++ b/r/tests/testthat/test-backwards-compatibility.R @@ -22,7 +22,7 @@ # To write a new version of a test file for an old version, use docker(-compose) # to setup a linux distribution and use RStudio's public package manager binary # repo to install the old version. The following commands should be run at the -# root of the arrow repo directory and might need slight adjusments. +# root of the arrow repo directory and might need slight adjustments. 
# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose build --no-cache r # R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose run r /bin/bash # R diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R index 28ff308747584..9f69380c55b3b 100644 --- a/r/tests/testthat/test-dataset-write.R +++ b/r/tests/testthat/test-dataset-write.R @@ -139,7 +139,7 @@ test_that("Writing a dataset: Parquet->Parquet (default)", { ) }) -test_that("Writing a dataset: `basename_template` default behavier", { +test_that("Writing a dataset: `basename_template` default behavior", { ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") dst_dir <- make_temp_dir() @@ -840,7 +840,7 @@ test_that("Writing a dataset to text files with wrapper functions.", { expect_equal(new_ds %>% collect(), df) }) -test_that("Writing a flat file dataset: `basename_template` default behavier", { +test_that("Writing a flat file dataset: `basename_template` default behavior", { ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") dst_dir <- make_temp_dir() diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index e707a194a3626..4d3226798d3ff 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1550,7 +1550,7 @@ test_that("as.difftime()", { ) # only integer (or integer-like) -> duration conversion supported in Arrow. - # double -> duration not supported. we're not testing the content of the + # double -> duration not supported. We aren't testing the content of the # error message as it is being generated in the C++ code and it might change, # but we want to make sure that this error is raised in our binding implementation expect_error( @@ -1961,7 +1961,7 @@ test_that("`as.Date()` and `as_date()`", { # `as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg # to `as.Date()` # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object - # passsed if`tz` is NULL + # passed if`tz` is NULL compare_dplyr_binding( .input %>% transmute( @@ -2831,7 +2831,7 @@ test_that("parse_date_time with truncated formats", { }) test_that("parse_date_time with `locale != NULL` not supported", { - # parse_date_time currently doesn't take locale paramete which will be + # parse_date_time currently doesn't take locale parameter which will be # addressed in https://issues.apache.org/jira/browse/ARROW-17147 skip_if_not_available("re2") @@ -3038,7 +3038,7 @@ test_that("build_formats() and build_format_from_order()", { # an "easy" date to avoid conflating tests of different things (i.e., it's # UTC time, and not one of the edge cases on or extremely close to the -# rounding boundaty) +# rounding boundary) easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC") easy_df <- tibble::tibble(datetime = easy_date) @@ -3703,7 +3703,7 @@ test_that("with_tz() and force_tz() works", { roll_dst = "post") ) %>% collect(), - "roll_dst` value must be 'error' or 'boundary' for non-existent times" + "roll_dst` value must be 'error' or 'boundary' for nonexistent times" ) expect_warning( @@ -3716,7 +3716,7 @@ test_that("with_tz() and force_tz() works", { ) ) %>% collect(), - "`roll_dst` value must be 'error', 'pre', or 'post' for non-existent times" + "`roll_dst` value must be 'error', 'pre', or 'post' for nonexistent times" ) # Raise error when the timezone falls into the DST-break diff --git a/r/tests/testthat/test-dplyr-summarize.R 
b/r/tests/testthat/test-dplyr-summarize.R index 29993711921e2..b2b2a9e54695d 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -355,7 +355,7 @@ test_that("Functions that take ... but we only accept a single arg", { test_that("median()", { # When medians are integer-valued, stats::median() sometimes returns output of - # type integer, whereas whereas the Arrow approx_median kernels always return + # type integer, whereas the Arrow approx_median kernels always return # output of type float64. The calls to median(int, ...) in the tests below # are enclosed in as.double() to work around this known difference. @@ -414,6 +414,8 @@ test_that("median()", { }) test_that("quantile()", { + skip_if_not_available("dataset") + # The default method for stats::quantile() throws an error when na.rm = FALSE # and the input contains NA or NaN, whereas the Arrow tdigest kernels return # null in this situation. To work around this known difference, the tests @@ -432,7 +434,7 @@ test_that("quantile()", { # returned by Arrow. # When quantiles are integer-valued, stats::quantile() sometimes returns - # output of type integer, whereas whereas the Arrow tdigest kernels always + # output of type integer, whereas the Arrow tdigest kernels always # return output of type float64. The calls to quantile(int, ...) in the tests # below are enclosed in as.double() to work around this known difference. @@ -510,9 +512,9 @@ test_that("quantile()", { ) # with a vector of 2+ probs - expect_warning( - Table$create(tbl) %>% - summarize(q = quantile(dbl, probs = c(0.2, 0.8), na.rm = TRUE)), + expect_error( + InMemoryDataset$create(data.frame(x = 1)) %>% + summarize(q = quantile(x, probs = c(0.2, 0.8), na.rm = TRUE)), "quantile() with length(probs) != 1 not supported in Arrow", fixed = TRUE ) @@ -839,7 +841,7 @@ test_that("Expressions on aggregations", { ) ) - # Check aggregates on aggeregates with more complex calls + # Check aggregates on aggregates with more complex calls expect_warning( record_batch(tbl) %>% summarise(any(any(!lgl))), paste( @@ -910,28 +912,24 @@ test_that("Not (yet) supported: implicit join", { compare_dplyr_binding( .input %>% - group_by(some_grouping) %>% - summarize( - dbl - mean(dbl) - ) %>% + group_by(x) %>% + summarize(y - mean(y)) %>% collect(), - tbl, + data.frame(x = 1, y = 2), warning = paste( - "Expression dbl - mean\\(dbl\\) is not an aggregate expression", + "Expression y - mean\\(y\\) is not an aggregate expression", "or is not supported in Arrow; pulling data into R" ) ) compare_dplyr_binding( .input %>% - group_by(some_grouping) %>% - summarize( - dbl - ) %>% + group_by(x) %>% + summarize(y) %>% collect(), - tbl, + data.frame(x = 1, y = 2), warning = paste( - "Expression dbl is not an aggregate expression", + "Expression y is not an aggregate expression", "or is not supported in Arrow; pulling data into R" ) ) @@ -939,14 +937,12 @@ test_that("Not (yet) supported: implicit join", { # This one could possibly be supported--in mutate() compare_dplyr_binding( .input %>% - group_by(some_grouping) %>% - summarize( - dbl - int - ) %>% + group_by(x) %>% + summarize(x - y) %>% collect(), - tbl, + data.frame(x = 1, y = 2, z = 3), warning = paste( - "Expression dbl - int is not an aggregate expression", + "Expression x - y is not an aggregate expression", "or is not supported in Arrow; pulling data into R" ) ) @@ -1188,12 +1184,12 @@ test_that("Can use across() within summarise()", { # across() doesn't work in summarise when input expressions evaluate to 
bare field references expect_warning( - example_data %>% + data.frame(x = 1, y = 2) %>% arrow_table() %>% - group_by(lgl) %>% + group_by(x) %>% summarise(across(everything())) %>% collect(), - regexp = "Expression int is not an aggregate expression or is not supported in Arrow; pulling data into R" + regexp = "Expression y is not an aggregate expression or is not supported in Arrow; pulling data into R" ) }) diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index 55a1f8d21eedb..8b3d7d8aaa902 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -256,7 +256,7 @@ test_that("RecordBatch can roundtrip extension types", { ) # check both column orders, since column order should stay in the same - # order whether the colunns are are extension types or not + # order whether the columns are extension types or not mixed_record_batch2 <- record_batch( normal = normal_vctr, custom = custom_array @@ -296,7 +296,7 @@ test_that("Table can roundtrip extension types", { ) # check both column orders, since column order should stay in the same - # order whether the colunns are are extension types or not + # order whether the columns are extension types or not mixed_table2 <- arrow_table( normal = normal_vctr, custom = custom_array diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 03cbfbc5e91a8..1794acee70d22 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -#### Fuctions #### check end of file for main logic +#### Functions #### check end of file for main logic env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) # Log messages in the style of the configure script @@ -72,7 +72,7 @@ find_latest_nightly <- function(description_version, lg("Failed to find latest nightly for %s", description_version) latest <- description_version } else { - lg("Found latest nightly for %s: %s", description_version, res) + lg("Latest available nightly for %s: %s", description_version, res) latest <- res } latest @@ -81,8 +81,9 @@ find_latest_nightly <- function(description_version, try_download <- function(from_url, to_file, hush = quietly) { # We download some fairly large files, so ensure the timeout is set appropriately. # This assumes a static library size of 100 MB (generous) and a download speed - # of 1 MB/s (slow). - opts <- options(timeout = max(100, getOption("timeout"))) + # of .3 MB/s (slow). This is to anticipate slower user connections or load on + # artifactory servers. 
+ opts <- options(timeout = max(300, getOption("timeout"))) on.exit(options(opts)) status <- try( @@ -99,16 +100,12 @@ download_binary <- function(lib) { libfile <- paste0("arrow-", VERSION, ".zip") binary_url <- paste0(arrow_repo, "bin/", lib, "/arrow-", VERSION, ".zip") if (try_download(binary_url, libfile)) { - if (!quietly) { - lg("Successfully retrieved C++ binaries (%s)", lib) - } + lg("Successfully retrieved libarrow (%s)", lib) } else { - if (!quietly) { lg( - "Downloading libarrow binary failed for version %s (%s)\n at %s", + "Downloading libarrow failed for version %s (%s)\n at %s", VERSION, lib, binary_url ) - } libfile <- NULL } # Explicitly setting the env var to "false" will skip checksum validation @@ -140,11 +137,11 @@ download_binary <- function(lib) { checksum_ok <- system2(checksum_cmd, args = checksum_args) if (checksum_ok != 0) { - cat("*** Checksum validation failed for libarrow binary: ", libfile, "\n") + lg("Checksum validation failed for libarrow: %s/%s", lib, libfile) unlink(libfile) libfile <- NULL } else { - cat("*** Checksum validated successfully for libarrow binary: ", libfile, "\n") + lg("Checksum validated successfully for libarrow: %s/%s", lib, libfile) } } @@ -899,7 +896,7 @@ download_libarrow_ok <- download_ok && !env_is("LIBARROW_DOWNLOAD", "false") thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tools/thirdparty_dependencies") arrow_versioned <- paste0("arrow-", VERSION) -# configure.win uses a different libarrow dir and and the zip is already nested +# configure.win uses a different libarrow dir and the zip is already nested if (on_windows) { lib_dir <- "windows" dst_dir <- lib_dir diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R index f97a80ccc2917..ed5192d806990 100644 --- a/r/tools/test-nixlibs.R +++ b/r/tools/test-nixlibs.R @@ -176,7 +176,7 @@ test_that("find_latest_nightly()", { find_latest_nightly(package_version("13.0.1.9000"), list_uri = tf_uri), package_version("13.0.0.100000335") ), - "Found latest nightly" + "Latest available nightly" ) expect_output( @@ -184,7 +184,7 @@ test_that("find_latest_nightly()", { find_latest_nightly(package_version("14.0.0.9000"), list_uri = tf_uri), package_version("14.0.0.100000001") ), - "Found latest nightly" + "Latest available nightly" ) expect_output( diff --git a/r/tools/update-checksums.R b/r/tools/update-checksums.R index 2aa9df317166f..f41652e87849e 100644 --- a/r/tools/update-checksums.R +++ b/r/tools/update-checksums.R @@ -38,6 +38,7 @@ if (!file.exists(tasks_yml)) { stop("Run this script from the r/ directory of the arrow repo") } +cat("Extracting libarrow binary paths from tasks.yml\n") # Get the libarrow binary paths from the tasks.yml file binary_paths <- readLines(tasks_yml) |> grep("r-lib__libarrow", x = _, value = TRUE) |> @@ -48,20 +49,24 @@ binary_paths <- readLines(tasks_yml) |> artifactory_root <- "https://apache.jfrog.io/artifactory/arrow/r/%s/libarrow/bin/%s" -# Get the checksuym file from the artifactory +# Get the checksum file from the artifactory for (path in binary_paths) { sha_path <- paste0(path, ".sha512") file <- file.path("tools/checksums", sha_path) dirname(file) |> dir.create(path = _, recursive = TRUE, showWarnings = FALSE) - + + cat(paste0("Downloading ", sha_path, "\n")) url <- sprintf(artifactory_root, VERSION, sha_path) download.file(url, file, quiet = TRUE, cacheOK = FALSE) if (grepl("windows", path)) { + cat(paste0("Converting ", path, " to windows style line endings\n")) # UNIX style line endings cause errors with mysys2 
sha512sum - sed_status <- system2("sed", args = c("-i", "s/\\r//", file)) + sed_status <- system2("sed", args = c("-i", "s/\\\\r//", file)) if (sed_status != 0) { stop("Failed to remove \\r from windows checksum file. Exit code: ", sed_status) } } } + +cat("Checksums updated successfully!\n") diff --git a/r/vignettes/arrow.Rmd b/r/vignettes/arrow.Rmd index c218b08ede77b..50329334ce8b0 100644 --- a/r/vignettes/arrow.Rmd +++ b/r/vignettes/arrow.Rmd @@ -66,7 +66,7 @@ as.data.frame(dat) When this coercion takes place, each of the columns in the original Arrow Table must be converted to native R data objects. In the `dat` Table, for instance, `dat$x` is stored as the Arrow data type int32 inherited from C++, which becomes an R integer type when `as.data.frame()` is called. -It is possible to exercise fine grained control over this conversion process. To learn more about the different types and how they are converted, see the [data types](./data_types.html) article. +It is possible to exercise fine-grained control over this conversion process. To learn more about the different types and how they are converted, see the [data types](./data_types.html) article. ## Reading and writing data diff --git a/r/vignettes/data_objects.Rmd b/r/vignettes/data_objects.Rmd index 7fcef8e6e78c6..065745182df04 100644 --- a/r/vignettes/data_objects.Rmd +++ b/r/vignettes/data_objects.Rmd @@ -259,7 +259,7 @@ write_parquet(df_b, file.path(ds_dir_b, "part-0.parquet")) write_parquet(df_c, file.path(ds_dir_c, "part-0.parquet")) ``` -If we had wanted to, we could have further subdivided the dataset. A folder could contain multiple files (`part-0.parquet`, `part-1.parquet`, etc) if we wanted it to. Similarly, there is no particular reason to name the files `part-0.parquet` this way at all: it would have been fine to call these files `subset-a.parquet`, `subset-b.parquet`, and `subset-c.parquet` if we had wished. We could have written other file formats if we wanted, and we don't necessarily have to use Hive-style folders. You can learn more about the supported formats by reading the help documentation for `open_dataset()`, and learn about how to exercise fine grained control with `help("Dataset", package = "arrow")`. +If we had wanted to, we could have further subdivided the dataset. A folder could contain multiple files (`part-0.parquet`, `part-1.parquet`, etc) if we wanted it to. Similarly, there is no particular reason to name the files `part-0.parquet` this way at all: it would have been fine to call these files `subset-a.parquet`, `subset-b.parquet`, and `subset-c.parquet` if we had wished. We could have written other file formats if we wanted, and we don't necessarily have to use Hive-style folders. You can learn more about the supported formats by reading the help documentation for `open_dataset()`, and learn about how to exercise fine-grained control with `help("Dataset", package = "arrow")`. In any case, we have created an on-disk parquet Dataset using Hive-style partitioning. Our Dataset is defined by these files: diff --git a/r/vignettes/data_types.Rmd b/r/vignettes/data_types.Rmd index 6cbe7c72e6809..4b5ee01b6ab83 100644 --- a/r/vignettes/data_types.Rmd +++ b/r/vignettes/data_types.Rmd @@ -34,7 +34,7 @@ When the arrow package converts between R data and Arrow data, it will first che knitr::include_graphics("./data_types.png") ``` -In this image, black boxes refer to R data types and light blue boxes refer to Arrow data types. 
Directional arrows specify conversions (e.g., the bidirectional arrow between the logical R type and the boolean Arrow type means that R logicals convert to Arrow booleans and vice versa). Solid lines indicate that the this conversion rule is always the default; dashed lines mean that it only sometimes applies (the rules and special cases are described below). +In this image, black boxes refer to R data types and light blue boxes refer to Arrow data types. Directional arrows specify conversions (e.g., the bidirectional arrow between the logical R type and the boolean Arrow type means that the logical R converts to an Arrow boolean and vice versa). Solid lines indicate that this conversion rule is always the default; dashed lines mean that it only sometimes applies (the rules and special cases are described below). ## Logical/boolean types diff --git a/r/vignettes/data_wrangling.Rmd b/r/vignettes/data_wrangling.Rmd index e3d5b306f3e71..305a91c156eb1 100644 --- a/r/vignettes/data_wrangling.Rmd +++ b/r/vignettes/data_wrangling.Rmd @@ -165,7 +165,7 @@ sw2 %>% transmute(name, height, mass, res = residuals(lm(mass ~ height))) ``` -Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise opertation like `filter()` is not supported: +Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise operation like `filter()` is not supported: ```{r} sw %>% diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index de33e72407792..8e7cff7410473 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -46,18 +46,18 @@ not possible to link to a system version of libarrow during development). ## Option 1: Using nightly libarrow binaries -On Linux, MacOS, and Windows you can use the same workflow you might use for another +On Linux, macOS, and Windows you can use the same workflow you might use for another package that contains compiled code (e.g., `R CMD INSTALL .` from a terminal, `devtools::load_all()` from an R prompt, or `Install & Restart` from RStudio). If the `arrow/r/libarrow` directory is not populated, the configure script will attempt to download the latest nightly libarrow binary, extract it to the -`arrow/r/libarrow` directory (MacOS, Linux) or `arrow/r/windows` +`arrow/r/libarrow` directory (macOS, Linux) or `arrow/r/windows` directory (Windows), and continue building the R package as usual. Most of the time, you won't need to update your version of libarrow because the R package rarely changes with updates to the C++ library; however, if you start to get errors when rebuilding the R package, you may have to remove the -`libarrow` directory (MacOS, Linux) or `windows` directory (Windows) +`libarrow` directory (macOS, Linux) or `windows` directory (Windows) and do a "clean" rebuild. You can do this from a terminal with `R CMD INSTALL . --preclean`, from RStudio using the "Clean and Install" option from "Build" tab, or using `make clean` if you are using the `Makefile` diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index a21a7864f7d73..50278af25bd1b 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -14,7 +14,7 @@ This article provides an overview of working with both S3 and GCS data using the ## S3 and GCS support on Linux -Before you start, make sure that your arrow install has support for S3 and/or GCS enabled. 
For most users this will be true by default, because the Windows and MacOS binary packages hosted on CRAN include S3 and GCS support. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow install has support for S3 and/or GCS enabled. For most users this will be true by default, because the Windows and macOS binary packages hosted on CRAN include S3 and GCS support. You can check whether support is enabled via helper functions: ```r arrow_with_s3() @@ -307,7 +307,7 @@ Sys.unsetenv("AWS_S3_ENDPOINT") ``` By default, the AWS SDK tries to retrieve metadata about user configuration, -which can cause conficts when passing in connection details via URI (for example +which can cause conflicts when passing in connection details via URI (for example when accessing a MINIO bucket). To disable the use of AWS environment variables, you can set environment variable `AWS_EC2_METADATA_DISABLED` to `TRUE`. diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 10155e3a8cd5b..df43a9de36fc2 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -10,9 +10,9 @@ In most cases, `install.packages("arrow")` should just work. There are things yo ## Background -The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or MacOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. +The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or macOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. -This article outlines the recommend approaches to installing arrow on Linux, starting from the simplest and least customizable to the most complex but with more flexbility to customize your installation. +This article outlines the recommend approaches to installing arrow on Linux, starting from the simplest and least customizable to the most complex but with more flexibility to customize your installation. The primary audience for this document is arrow R package _users_ on Linux, and not Arrow _developers_. Additional resources for developers are listed at the end of this article. @@ -225,7 +225,7 @@ already present (when set to `AUTO`, the default). These dependencies vary by platform; however, if you wish to install these yourself prior to libarrow installation, we recommend that you take a look at the [docker file for whichever of our CI builds](https://github.com/apache/arrow/tree/main/ci/docker) -(the ones ending in "cpp" are for building Arrow's C++ libaries, aka libarrow) +(the ones ending in "cpp" are for building Arrow's C++ libraries, aka libarrow) corresponds most closely to your setup. This will contain the most up-to-date information about dependencies and minimum versions. 
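The fs.Rmd hunk above describes checking whether a build has S3/GCS support and disabling the AWS SDK's metadata lookup via `AWS_EC2_METADATA_DISABLED`. A minimal sketch of that workflow follows; the helper functions and the environment variable name come from the vignette text above, while the bucket URI and column handling are purely illustrative assumptions.

```r
# Minimal sketch (illustrative URI): confirm the libarrow build has S3/GCS
# support, then disable the AWS SDK's EC2 metadata lookup before passing
# connection details via URI (e.g. a MinIO-style endpoint).
library(arrow)

arrow_with_s3()   # TRUE when the installed libarrow includes S3 support
arrow_with_gcs()  # TRUE when the installed libarrow includes GCS support

# Avoid conflicts between URI-supplied credentials and AWS environment
# configuration, as described in the fs.Rmd hunk above.
Sys.setenv(AWS_EC2_METADATA_DISABLED = "TRUE")

# Hypothetical bucket and path, shown only to illustrate the call shape.
df <- read_parquet("s3://my-bucket/path/data.parquet")
```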
diff --git a/r/vignettes/read_write.Rmd b/r/vignettes/read_write.Rmd index 15b2392b8ee5c..0ee695a6f4907 100644 --- a/r/vignettes/read_write.Rmd +++ b/r/vignettes/read_write.Rmd @@ -140,7 +140,7 @@ write_csv_arrow(mtcars, file_path) read_csv_arrow(file_path, col_select = starts_with("d")) ``` -In addition to the options provided by the readr-style arguments (`delim`, `quote`, `escape_doubple`, `escape_backslash`, etc), you can use the `schema` argument to specify column types: see `schema()` help for details. There is also the option of using `parse_options`, `convert_options`, and `read_options` to exercise fine-grained control over the arrow csv reader: see `help("CsvReadOptions", package = "arrow")` for details. +In addition to the options provided by the readr-style arguments (`delim`, `quote`, `escape_double`, `escape_backslash`, etc), you can use the `schema` argument to specify column types: see `schema()` help for details. There is also the option of using `parse_options`, `convert_options`, and `read_options` to exercise fine-grained control over the arrow csv reader: see `help("CsvReadOptions", package = "arrow")` for details. ## JSON format diff --git a/ruby/red-arrow/lib/arrow/array.rb b/ruby/red-arrow/lib/arrow/array.rb index e7ca5606761f2..2c5e5cf2754eb 100644 --- a/ruby/red-arrow/lib/arrow/array.rb +++ b/ruby/red-arrow/lib/arrow/array.rb @@ -250,7 +250,7 @@ def resolve(other_array) "[array][resolve] need to implement " + "a feature that building #{value_data_type} array " + "from raw Ruby Array" - raise NotImpelemented, message + raise NotImplemented, message end other_array elsif other_array.respond_to?(:value_data_type) diff --git a/ruby/red-arrow/lib/arrow/sort-key.rb b/ruby/red-arrow/lib/arrow/sort-key.rb index 7ceab631ea23c..e1df50ebb7c68 100644 --- a/ruby/red-arrow/lib/arrow/sort-key.rb +++ b/ruby/red-arrow/lib/arrow/sort-key.rb @@ -79,9 +79,9 @@ def try_convert(value) # target and corresponding order is used. `"+"` uses ascending # order and `"-"` uses ascending order. # - # If `target` is not a String nor `target` doesn't start with the - # leading order mark, sort column target is `target` as-is and - # ascending order is used. + # If `target` is either not a String or `target` doesn't start + # with the leading order mark, sort column is `target` as-is + # and ascending order is used. # # @example String without the leading order mark # key = Arrow::SortKey.new("count") diff --git a/swift/Arrow/Package.swift b/swift/Arrow/Package.swift index 065afe62640ea..946eb999c798a 100644 --- a/swift/Arrow/Package.swift +++ b/swift/Arrow/Package.swift @@ -32,7 +32,11 @@ let package = Package( targets: ["Arrow"]), ], dependencies: [ - .package(url: "https://github.com/google/flatbuffers.git", from: "23.3.3") + // The latest version of flatbuffers v23.5.26 was built in May 26, 2023 + // and therefore doesn't include the unaligned buffer swift changes. + // This can be changed back to using the tag once a new version of + // flatbuffers has been released. + .package(url: "https://github.com/google/flatbuffers.git", branch: "master") ], targets: [ // Targets are the basic building blocks of a package. A target can define a module or a test suite. 
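Referring back to the read_write.Rmd hunk earlier in this diff: that paragraph notes that, besides the readr-style arguments, `read_csv_arrow()` accepts a `schema` argument to fix column types. A short sketch under assumed column names is below; the `skip = 1` reflects the documented need to skip the header row when a full schema is supplied, and the data itself is invented for illustration.

```r
# Minimal sketch (illustrative columns): override type inference with an
# explicit schema when reading a CSV, per the read_write.Rmd hunk above.
library(arrow)

tf <- tempfile(fileext = ".csv")
write_csv_arrow(data.frame(id = 1:3, score = c(0.5, 0.7, 0.9)), tf)

read_csv_arrow(
  tf,
  schema = schema(id = int64(), score = float64()),
  skip = 1  # the file has a header row; skip it so the supplied schema is used
)
```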
diff --git a/swift/Arrow/Sources/Arrow/ArrowReader.swift b/swift/Arrow/Sources/Arrow/ArrowReader.swift index ef995b18052a8..d9dc1bdb470e6 100644 --- a/swift/Arrow/Sources/Arrow/ArrowReader.swift +++ b/swift/Arrow/Sources/Arrow/ArrowReader.swift @@ -132,7 +132,8 @@ public class ArrowReader { } public func fromStream( // swiftlint:disable:this function_body_length - _ fileData: Data + _ fileData: Data, + useUnalignedBuffers: Bool = false ) -> Result { let footerLength = fileData.withUnsafeBytes { rawBuffer in rawBuffer.loadUnaligned(fromByteOffset: fileData.count - 4, as: Int32.self) @@ -141,7 +142,9 @@ public class ArrowReader { let result = ArrowReaderResult() let footerStartOffset = fileData.count - Int(footerLength + 4) let footerData = fileData[footerStartOffset...] - let footerBuffer = ByteBuffer(data: footerData) + let footerBuffer = ByteBuffer( + data: footerData, + allowReadingUnalignedBuffers: useUnalignedBuffers) let footer = org_apache_arrow_flatbuf_Footer.getRootAsFooter(bb: footerBuffer) let schemaResult = loadSchema(footer.schema!) switch schemaResult { @@ -170,7 +173,9 @@ public class ArrowReader { let messageStartOffset = recordBatch.offset + (Int64(MemoryLayout.size) * messageOffset) let messageEndOffset = messageStartOffset + Int64(messageLength) let recordBatchData = fileData[messageStartOffset ..< messageEndOffset] - let mbb = ByteBuffer(data: recordBatchData) + let mbb = ByteBuffer( + data: recordBatchData, + allowReadingUnalignedBuffers: useUnalignedBuffers) let message = org_apache_arrow_flatbuf_Message.getRootAsMessage(bb: mbb) switch message.headerType { case .recordbatch: @@ -219,9 +224,12 @@ public class ArrowReader { public func fromMessage( _ dataHeader: Data, dataBody: Data, - result: ArrowReaderResult + result: ArrowReaderResult, + useUnalignedBuffers: Bool = false ) -> Result { - let mbb = ByteBuffer(data: dataHeader) + let mbb = ByteBuffer( + data: dataHeader, + allowReadingUnalignedBuffers: useUnalignedBuffers) let message = org_apache_arrow_flatbuf_Message.getRootAsMessage(bb: mbb) switch message.headerType { case .schema: diff --git a/swift/Arrow/Sources/Arrow/Schema_generated.swift b/swift/Arrow/Sources/Arrow/Schema_generated.swift index 990d9c3df18b1..c6fd6bf1d98b4 100644 --- a/swift/Arrow/Sources/Arrow/Schema_generated.swift +++ b/swift/Arrow/Sources/Arrow/Schema_generated.swift @@ -1023,7 +1023,7 @@ public struct org_apache_arrow_flatbuf_Time: FlatBufferObject, Verifiable { /// no indication of how to map this information to a physical point in time. /// Naive date-times must be handled with care because of this missing /// information, and also because daylight saving time (DST) may make -/// some values ambiguous or non-existent. A naive date-time may be +/// some values ambiguous or nonexistent. A naive date-time may be /// stored as a struct with Date and Time fields. However, it may also be /// encoded into a Timestamp column with an empty timezone. 
The timestamp /// values should be computed "as if" the timezone of the date-time values diff --git a/swift/Arrow/Tests/ArrowTests/IPCTests.swift b/swift/Arrow/Tests/ArrowTests/IPCTests.swift index 4ea252b78d8f6..59cad94ef4da5 100644 --- a/swift/Arrow/Tests/ArrowTests/IPCTests.swift +++ b/swift/Arrow/Tests/ArrowTests/IPCTests.swift @@ -295,7 +295,7 @@ final class IPCFileReaderTests: XCTestCase { } } - func testBinaryInMemroyToFromStream() throws { + func testBinaryInMemoryToFromStream() throws { let dataset = try makeBinaryDataset() let writerInfo = ArrowWriter.Info(.recordbatch, schema: dataset.0, batches: [dataset.1]) let arrowWriter = ArrowWriter() @@ -324,7 +324,7 @@ final class IPCFileReaderTests: XCTestCase { } } - func testTimeInMemroyToFromStream() throws { + func testTimeInMemoryToFromStream() throws { let dataset = try makeTimeDataset() let writerInfo = ArrowWriter.Info(.recordbatch, schema: dataset.0, batches: [dataset.1]) let arrowWriter = ArrowWriter() diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift index ca505869ec774..ef3e4fa239e84 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightClient.swift @@ -24,8 +24,11 @@ import Arrow public class FlightClient { let client: Arrow_Flight_Protocol_FlightServiceAsyncClient - public init(channel: GRPCChannel) { + let allowReadingUnalignedBuffers: Bool + + public init(channel: GRPCChannel, allowReadingUnalignedBuffers: Bool = false ) { client = Arrow_Flight_Protocol_FlightServiceAsyncClient(channel: channel) + self.allowReadingUnalignedBuffers = allowReadingUnalignedBuffers } private func readMessages( @@ -34,7 +37,11 @@ public class FlightClient { let reader = ArrowReader() let arrowResult = ArrowReader.makeArrowReaderResult() for try await data in responseStream { - switch reader.fromMessage(data.dataHeader, dataBody: data.dataBody, result: arrowResult) { + switch reader.fromMessage( + data.dataHeader, + dataBody: data.dataBody, + result: arrowResult, + useUnalignedBuffers: allowReadingUnalignedBuffers) { case .success: continue case .failure(let error): @@ -48,17 +55,17 @@ public class FlightClient { private func writeBatches( _ requestStream: GRPCAsyncRequestStreamWriter, descriptor: FlightDescriptor, - recordBatchs: [RecordBatch] + recordBatches: [RecordBatch] ) async throws { let writer = ArrowWriter() - switch writer.toMessage(recordBatchs[0].schema) { + switch writer.toMessage(recordBatches[0].schema) { case .success(let schemaData): try await requestStream.send( FlightData( schemaData, dataBody: Data(), flightDescriptor: descriptor).toProtocol()) - for recordBatch in recordBatchs { + for recordBatch in recordBatches { switch writer.toMessage(recordBatch) { case .success(let data): try await requestStream.send( @@ -122,14 +129,14 @@ public class FlightClient { public func doPut( _ descriptor: FlightDescriptor, - recordBatchs: [RecordBatch], + recordBatches: [RecordBatch], closure: (FlightPutResult) throws -> Void) async throws { - if recordBatchs.isEmpty { + if recordBatches.isEmpty { throw ArrowFlightError.emptyCollection } let putCall = client.makeDoPutCall() - try await writeBatches(putCall.requestStream, descriptor: descriptor, recordBatchs: recordBatchs) + try await writeBatches(putCall.requestStream, descriptor: descriptor, recordBatches: recordBatches) var closureCalled = false for try await response in putCall.responseStream { try closure(FlightPutResult(response)) @@ -158,20 
+165,20 @@ public class FlightClient { public func doExchange( _ descriptor: FlightDescriptor, - recordBatchs: [RecordBatch], + recordBatches: [RecordBatch], closure: (ArrowReader.ArrowReaderResult) throws -> Void) async throws { - if recordBatchs.isEmpty { + if recordBatches.isEmpty { throw ArrowFlightError.emptyCollection } let exchangeCall = client.makeDoExchangeCall() - try await writeBatches(exchangeCall.requestStream, descriptor: descriptor, recordBatchs: recordBatchs) + try await writeBatches(exchangeCall.requestStream, descriptor: descriptor, recordBatches: recordBatches) try closure(try await readMessages(exchangeCall.responseStream)) } - public func doExchange(fligthData: FlightData, closure: (FlightData) throws -> Void) async throws { + public func doExchange(flightData: FlightData, closure: (FlightData) throws -> Void) async throws { let exchangeCall = client.makeDoExchangeCall() - try await exchangeCall.requestStream.send(fligthData.toProtocol()) + try await exchangeCall.requestStream.send(flightData.toProtocol()) exchangeCall.requestStream.finish() for try await response in exchangeCall.responseStream { try closure(FlightData(response)) diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift index a34bf5c0acee9..19644d632e997 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightServer.swift @@ -63,6 +63,7 @@ public func schemaFromMessage(_ schemaData: Data) -> ArrowSchema? { } public protocol ArrowFlightServer: Sendable { + var allowReadingUnalignedBuffers: Bool { get } func listFlights(_ criteria: FlightCriteria, writer: FlightInfoStreamWriter) async throws func getFlightInfo(_ request: FlightDescriptor) async throws -> FlightInfo func getSchema(_ request: FlightDescriptor) async throws -> ArrowFlight.FlightSchemaResult @@ -73,6 +74,12 @@ public protocol ArrowFlightServer: Sendable { func doExchange(_ reader: RecordBatchStreamReader, writer: RecordBatchStreamWriter) async throws } +extension ArrowFlightServer { + var allowReadingUnalignedBuffers: Bool { + return false + } +} + public func makeFlightServer(_ handler: ArrowFlightServer) -> CallHandlerProvider { return InternalFlightServer(handler) } diff --git a/swift/ArrowFlight/Sources/ArrowFlight/FlightSql.pb.swift b/swift/ArrowFlight/Sources/ArrowFlight/FlightSql.pb.swift index 238092266904b..18b839fcbc0cc 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/FlightSql.pb.swift +++ b/swift/ArrowFlight/Sources/ArrowFlight/FlightSql.pb.swift @@ -476,7 +476,7 @@ enum Arrow_Flight_Protocol_Sql_SqlInfo: SwiftProtobuf.Enum { /// Retrieves a int64 value representing the maximum number of characters allowed for a column name. case sqlMaxColumnNameLength // = 543 - /// Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. + /// Retrieves a int64 value representing the maximum number of columns allowed in a GROUP BY clause. case sqlMaxColumnsInGroupBy // = 544 /// Retrieves a int64 value representing the maximum number of columns allowed in an index. @@ -1816,7 +1816,7 @@ extension Arrow_Flight_Protocol_Sql_SqlSupportsConvert: CaseIterable { ///* /// The JDBC/ODBC-defined type of any object. -/// All the values here are the sames as in the JDBC and ODBC specs. +/// All the values here are the same as in the JDBC and ODBC specs. 
enum Arrow_Flight_Protocol_Sql_XdbcDataType: SwiftProtobuf.Enum { typealias RawValue = Int case xdbcUnknownType // = 0 @@ -2104,7 +2104,7 @@ enum Arrow_Flight_Protocol_Sql_Nullable: SwiftProtobuf.Enum { case nullabilityNullable // = 1 ///* - /// Indicates that nullability of the fields can not be determined. + /// Indicates that nullability of the fields cannot be determined. case nullabilityUnknown // = 2 case UNRECOGNIZED(Int) @@ -2149,7 +2149,7 @@ enum Arrow_Flight_Protocol_Sql_Searchable: SwiftProtobuf.Enum { typealias RawValue = Int ///* - /// Indicates that column can not be used in a WHERE clause. + /// Indicates that column cannot be used in a WHERE clause. case none // = 0 ///* @@ -2489,7 +2489,7 @@ struct Arrow_Flight_Protocol_Sql_CommandGetDbSchemas { /// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size /// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable /// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. @@ -3249,7 +3249,7 @@ extension Arrow_Flight_Protocol_Sql_ActionEndSavepointRequest.EndSavepoint: Case /// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size /// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable /// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - GetFlightInfo: execute the query. @@ -3290,7 +3290,7 @@ struct Arrow_Flight_Protocol_Sql_CommandStatementQuery { /// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size /// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable /// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. -/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - GetFlightInfo: execute the query. @@ -3356,7 +3356,7 @@ struct Arrow_Flight_Protocol_Sql_TicketStatementQuery { /// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size /// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable /// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. 
-/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case-sensitive, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. @@ -3376,7 +3376,7 @@ struct Arrow_Flight_Protocol_Sql_CommandPreparedStatementQuery { /// /// Represents a SQL update query. Used in the command member of FlightDescriptor -/// for the the RPC call DoPut to cause the server to execute the included SQL update. +/// for the RPC call DoPut to cause the server to execute the included SQL update. struct Arrow_Flight_Protocol_Sql_CommandStatementUpdate { // SwiftProtobuf.Message conformance is added in an extension below. See the // `Message` and `Message+*Additions` files in the SwiftProtobuf library for @@ -3404,7 +3404,7 @@ struct Arrow_Flight_Protocol_Sql_CommandStatementUpdate { /// /// Represents a SQL update query. Used in the command member of FlightDescriptor -/// for the the RPC call DoPut to cause the server to execute the included +/// for the RPC call DoPut to cause the server to execute the included /// prepared statement handle as an update. struct Arrow_Flight_Protocol_Sql_CommandPreparedStatementUpdate { // SwiftProtobuf.Message conformance is added in an extension below. See the @@ -3611,7 +3611,7 @@ extension Arrow_Flight_Protocol_Sql_ActionCancelQueryResult.CancelResult: @unche // MARK: - Extension Properties -// Swift Extensions on the exteneded Messages to add easy access to the declared +// Swift Extensions on the extended Messages to add easy access to the declared // extension fields. The names are based on the extension field name from the proto // declaration. To avoid naming collisions, the names are prefixed with the name of // the scope where the extend directive occurs. diff --git a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift index 972d19435ddfc..464752dbcbeea 100644 --- a/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift +++ b/swift/ArrowFlight/Sources/ArrowFlight/RecordBatchStreamReader.swift @@ -27,10 +27,13 @@ public class RecordBatchStreamReader: AsyncSequence, AsyncIteratorProtocol { var descriptor: FlightDescriptor? var batchIndex = 0 var streamIterator: any AsyncIteratorProtocol + var useUnalignedBuffers: Bool let stream: GRPC.GRPCAsyncRequestStream - init(_ stream: GRPC.GRPCAsyncRequestStream) { + init(_ stream: GRPC.GRPCAsyncRequestStream, + useUnalignedBuffers: Bool = false) { self.stream = stream self.streamIterator = self.stream.makeAsyncIterator() + self.useUnalignedBuffers = useUnalignedBuffers } public func next() async throws -> (Arrow.RecordBatch?, FlightDescriptor?)? 
{ @@ -55,7 +58,11 @@ public class RecordBatchStreamReader: AsyncSequence, AsyncIteratorProtocol { let dataBody = flightData.dataBody let dataHeader = flightData.dataHeader descriptor = FlightDescriptor(flightData.flightDescriptor) - switch reader.fromMessage(dataHeader, dataBody: dataBody, result: result) { + switch reader.fromMessage( + dataHeader, + dataBody: dataBody, + result: result, + useUnalignedBuffers: useUnalignedBuffers) { case .success(()): if result.batches.count > 0 { batches = result.batches diff --git a/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift b/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift index db33dfc38734a..8097388c7fde1 100644 --- a/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift +++ b/swift/ArrowFlight/Tests/ArrowFlightTests/FlightTest.swift @@ -278,7 +278,7 @@ public class FlightClientTester { let descriptor = FlightDescriptor(cmd: cmd.data(using: .utf8)!) let rb = try makeRecordBatch() var numCall = 0 - try await client?.doPut(descriptor, recordBatchs: [rb], closure: { _ in + try await client?.doPut(descriptor, recordBatches: [rb], closure: { _ in numCall += 1 }) @@ -289,7 +289,7 @@ public class FlightClientTester { let descriptor = FlightDescriptor(cmd: "flight_ticket".data(using: .utf8)!) let rb = try makeRecordBatch() var numCall = 0 - try await client?.doExchange(descriptor, recordBatchs: [rb], closure: { result in + try await client?.doExchange(descriptor, recordBatches: [rb], closure: { result in numCall += 1 XCTAssertEqual(result.schema?.fields.count, 3) XCTAssertEqual(result.batches[0].length, 4)