diff --git a/.gitignore b/.gitignore index 5817efdcac091..6bb237af98ec7 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ apache-rat-*.jar arrow-src.tar +arrow-src.tar.gz # Compiled source *.a @@ -29,6 +30,8 @@ arrow-src.tar .build_cache_dir dependency-reduced-pom.xml MANIFEST +compile_commands.json +build.ninja # Generated Visual Studio files *.vcxproj @@ -36,10 +39,18 @@ MANIFEST *.sln *.iml +# Linux perf sample data +perf.data +perf.data.old + cpp/.idea/ cpp/apidoc/xml/ +docs/example.gz +docs/example1.dat +docs/example3.dat python/.eggs/ python/doc/ + .vscode .idea/ .pytest_cache/ diff --git a/.travis.yml b/.travis.yml index b877e205b5bd0..7abeb99e5207d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ # under the License. sudo: required -dist: trusty +dist: xenial services: - docker @@ -38,7 +38,6 @@ before_install: - | if [ $TRAVIS_OS_NAME == "linux" ]; then sudo bash -c "echo -e 'Acquire::Retries 10; Acquire::http::Timeout \"20\";' > /etc/apt/apt.conf.d/99-travis-retry" - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test sudo apt-get update -qq fi - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` @@ -46,12 +45,9 @@ before_install: matrix: fast_finish: true - allow_failures: - - jdk: oraclejdk9 - - language: r include: - # Lint C++, Python, R - - os: linux + - name: "Lint C++, Python, R" + os: linux language: python python: "3.6" env: @@ -62,70 +58,125 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh script: - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - # C++ & Python w/ gcc 4.9 - - compiler: gcc + - name: "C++ unit tests w/ Valgrind, clang 6.0" + language: cpp + os: linux + env: + - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 + - ARROW_TRAVIS_PLASMA=1 + - ARROW_TRAVIS_ORC=1 + - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 + - ARROW_BUILD_WARNING_LEVEL=CHECKIN + before_script: + - if [ $ARROW_CI_CPP_AFFECTED != "1" ]; then exit; fi + - export CC="clang-6.0" + - export CXX="clang++-6.0" + - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh + # If either C++ or Python changed, we must install the C++ libraries + - git submodule update --init + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + # Separating Valgrind and C++ coverage makes individual jobs shorter + - name: "C++ unit tests w/ gcc 5.4, coverage" + compiler: gcc language: cpp os: linux jdk: openjdk8 env: - ARROW_TRAVIS_USE_TOOLCHAIN=1 - - ARROW_TRAVIS_VALGRIND=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_COVERAGE=1 - ARROW_TRAVIS_PARQUET=1 - - ARROW_TRAVIS_PYTHON_DOCS=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - - ARROW_TRAVIS_PYTHON_JVM=1 - - ARROW_TRAVIS_JAVA_BUILD_ONLY=1 - - ARROW_TRAVIS_PYTHON_GANDIVA=1 - # ARROW-2999 Benchmarks are disabled in Travis CI for the time being - # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: - # (ARROW_CI_CPP_AFFECTED implies ARROW_CI_PYTHON_AFFECTED) - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - 
$TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - # All test steps are required for accurate C++ coverage info - - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - # Build Arrow Java to test the pyarrow<->JVM in-process bridge - - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - # Only run Plasma tests with valgrind in one of the Python builds because - # they are slow - - export PLASMA_VALGRIND=0 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - - export PLASMA_VALGRIND=1 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - # Gandiva C++ w/ gcc 4.9 and Java - - compiler: gcc + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh || travis_terminate 1 + - name: "C++ unit tests w/ gcc 4.8, trusty" + dist: trusty + compiler: gcc language: cpp os: linux jdk: openjdk8 env: - - ARROW_TRAVIS_GANDIVA=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_PLASMA=1 + - ARROW_TRAVIS_ORC=1 + - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" + before_install: + - ulimit -c unlimited -S + - | + if [ $TRAVIS_OS_NAME == "linux" ]; then + sudo bash -c "echo -e 'Acquire::Retries 10; Acquire::http::Timeout \"20\";' > /etc/apt/apt.conf.d/99-travis-retry" + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt-get update -qq + fi + - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` before_script: - # Run if something changed in CPP or Java. - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library + # If either C++ or Python changed, we must install the C++ libraries + - git submodule update --init + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - # [OS X] C++ & Python w/ XCode 6.4 - - compiler: clang + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh || travis_terminate 1 + - name: "Python 2.7 and 3.6 unit tests w/ Valgrind, gcc 5.4, coverage" + compiler: gcc language: cpp - osx_image: xcode6.4 + os: linux + jdk: openjdk8 + env: + # Valgrind is needed for the Plasma store tests + - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 + - ARROW_TRAVIS_COVERAGE=1 + - ARROW_TRAVIS_PYTHON_DOCS=1 + - ARROW_TRAVIS_PYTHON_JVM=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - ARROW_BUILD_WARNING_LEVEL=CHECKIN + - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 + # TODO(wesm): Run the benchmarks outside of Travis + # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 + before_script: + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ] && [ $ARROW_CI_DOCS_AFFECTED != "1" ]; then exit; fi + - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh || travis_terminate 1 + - ARROW_TRAVIS_PYTHON_GANDIVA=1 + # Only run Plasma tests with valgrind in one of the Python builds because + # they are slow + - export PLASMA_VALGRIND=0 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 
2.7 || travis_terminate 1 + - export PLASMA_VALGRIND=1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh + - name: "[OS X] C++ w/ XCode 8.3" + compiler: clang + language: cpp + osx_image: xcode8.3 os: osx cache: addons: @@ -134,43 +185,50 @@ matrix: - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + # ARROW-3803: The Xcode 8.3 image has Boost libraries in /usr/local/lib + # which can get loaded before the toolchain Boost libraries. These seem to + # get loaded even though we are modifying LD_LIBRARY_PATH. We build our own + # Boost and statically link to get around the issue until this can be + # investigated further + - ARROW_TRAVIS_VENDORED_BOOST=1 before_script: - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - - if [ $ARROW_CI_CPP_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh; fi - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - # [OS X] Gandiva C++ w/ XCode 8.3 & Java - - compiler: clang + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh + - name: "[OS X] Python w/ XCode 7.3" + compiler: clang language: cpp - # xcode 7.3 has a bug in strptime. - osx_image: xcode8.3 + osx_image: xcode7.3 os: osx cache: addons: env: - - ARROW_TRAVIS_GANDIVA=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - MACOSX_DEPLOYMENT_TARGET="10.9" before_script: - # Run if something changed in CPP or Java. 
- - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library script: - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - # [manylinux1] Python - - language: cpp + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 + - name: "[manylinux1] Python" + language: cpp before_script: - if [ $ARROW_CI_PYTHON_AFFECTED == "1" ]; then docker pull quay.io/xhochy/arrow_manylinux1_x86_64_base:latest; fi script: - if [ $ARROW_CI_PYTHON_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_manylinux.sh; fi - # Java w/ OpenJDK 8 - - language: java + - name: "Java w/ OpenJDK 8" + language: java os: linux jdk: openjdk8 before_script: @@ -179,51 +237,55 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - $TRAVIS_BUILD_DIR/ci/travis_script_javadoc.sh - # Java w/ Oracle JDK 9 - - language: java + - name: "Java w/ OpenJDK 9" + language: java os: linux - jdk: oraclejdk9 + jdk: openjdk9 before_script: - if [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi script: - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - addons: - apt: - packages: - - oracle-java9-installer - # Integration w/ OpenJDK 8 - - language: java + - name: "Java w/ OpenJDK 11" + language: java + os: linux + jdk: openjdk11 + before_script: + - if [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh + - name: "Integration w/ OpenJDK 8" + language: java os: linux env: ARROW_TEST_GROUP=integration jdk: openjdk8 env: - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_PLASMA_JAVA_CLIENT=1 - - CC="clang-6.0" - - CXX="clang++-6.0" before_script: - if [ $ARROW_CI_INTEGRATION_AFFECTED != "1" ]; then exit; fi + - export CC="clang-6.0" + - export CXX="clang++-6.0" - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - nvm install 10.1 + - nvm install 11.6 - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_integration.sh - $TRAVIS_BUILD_DIR/ci/travis_script_plasma_java_client.sh - # NodeJS - - language: node_js + - name: "NodeJS" + language: node_js os: linux node_js: - - '10.1' + - '11.6' before_script: - if [ $ARROW_CI_JS_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_js.sh - # C++ & GLib & Ruby w/ gcc 4.9 - - compiler: gcc + - name: "C++ & GLib & Ruby w/ gcc 5.4" + compiler: gcc language: cpp os: linux env: @@ -232,8 +294,6 @@ matrix: - ARROW_TRAVIS_USE_VENDORED_BOOST=1 - ARROW_TRAVIS_PARQUET=1 - ARROW_TRAVIS_PLASMA=1 - - BUILD_TORCH_EXAMPLE=no - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: - if [ $ARROW_CI_RUBY_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh @@ -244,8 +304,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh - # [OS X] C++ & GLib & Ruby w/ XCode 8.3 & homebrew - - compiler: clang + - name: "[OS X] C++ & GLib & Ruby w/ XCode 8.3 & homebrew" + compiler: clang osx_image: xcode8.3 os: osx env: @@ -255,7 +315,6 
@@ matrix: - ARROW_TRAVIS_PLASMA=1 cache: addons: - rvm: 2.2 before_script: - if [ $ARROW_CI_RUBY_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_osx.sh @@ -265,8 +324,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh - # Rust - - language: rust + - name: Rust + language: rust cache: cargo addons: apt: @@ -280,16 +339,15 @@ matrix: - if [ $ARROW_CI_RUST_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_cargo.sh script: - - RUSTUP_TOOLCHAIN=stable $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh || true - RUSTUP_TOOLCHAIN=nightly $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh after_success: - pushd ${TRAVIS_BUILD_DIR}/rust # Run coverage for codecov.io - mkdir -p target/kcov - - RUST_BACKTRACE=1 RUSTUP_TOOLCHAIN=stable cargo coverage --verbose + - RUST_BACKTRACE=1 RUSTUP_TOOLCHAIN=nightly cargo coverage --verbose - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - # Go - - language: go + - name: Go + language: go go_import_path: github.com/apache/arrow os: linux go: @@ -301,10 +359,13 @@ matrix: after_success: - pushd ${TRAVIS_BUILD_DIR}/go/arrow - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - # R - - language: r + - name: R + language: r cache: packages latex: false + dist: trusty + env: + - ARROW_TRAVIS_PARQUET=1 before_install: # Have to copy-paste this here because of how R's build steps work - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` @@ -317,7 +378,6 @@ matrix: fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library - - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib/pkgconfig - pushd ${TRAVIS_BUILD_DIR}/r diff --git a/CHANGELOG.md b/CHANGELOG.md index 853806cb0bcad..5cacdfdb219ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,610 @@ under the License. 
--> +# Apache Arrow 0.12.0 (16 January 2019) + +## Bug + +* ARROW-1847 - [Doc] Document the difference between RecordBatch and Table in an FAQ fashion +* ARROW-1994 - [Python] Test against Pandas master +* ARROW-2026 - [Python] Cast all timestamp resolutions to INT96 use\_deprecated\_int96\_timestamps=True +* ARROW-2038 - [Python] Follow-up bug fixes for s3fs Parquet support +* ARROW-2113 - [Python] Incomplete CLASSPATH with "hadoop" contained in it can fool the classpath setting HDFS logic +* ARROW-2591 - [Python] Segmentation fault when writing empty ListType column to Parquet +* ARROW-2592 - [Python] Error reading old Parquet file due to metadata backwards compatibility issue +* ARROW-2708 - [C++] Internal GetValues function in arrow::compute should check for nullptr +* ARROW-2970 - [Python] NumPyConverter::Visit for Binary/String/FixedSizeBinary can overflow +* ARROW-3058 - [Python] Feather reads fail with unintuitive error when conversion from pandas yields ChunkedArray +* ARROW-3186 - [GLib] mesonbuild failures in Travis CI +* ARROW-3202 - [C++] Build does not succeed on Alpine Linux +* ARROW-3225 - [C++/Python] Pandas object conversion of ListType and ListType +* ARROW-3324 - [Parquet] Free more internal resources when writing multiple row groups +* ARROW-3343 - [Java] Java tests fail non-deterministically with memory leak from Flight tests +* ARROW-3405 - [Python] Document CSV reader +* ARROW-3428 - [Python] from\_pandas gives incorrect results when converting floating point to bool +* ARROW-3436 - [C++] Boost version required by Gandiva is too new for Ubuntu 14.04 +* ARROW-3437 - [Gandiva][C++] Configure static linking of libgcc, libstdc++ with LDFLAGS +* ARROW-3438 - [Packaging] Escaped bulletpoints in changelog +* ARROW-3445 - [GLib] Parquet GLib doesn't link Arrow GLib +* ARROW-3449 - [C++] Support CMake 3.2 for "out of the box" builds +* ARROW-3466 - [Python] Crash when importing tensorflow and pyarrow +* ARROW-3467 - Building against external double conversion is broken +* ARROW-3470 - [C++] Row-wise conversion tutorial has fallen out of date +* ARROW-3477 - [C++] Testsuite fails on 32 bit arch +* ARROW-3480 - [Website] Install document for Ubuntu is broken +* ARROW-3485 - [C++] Examples fail with Protobuf error +* ARROW-3494 - [C++] re2 conda-forge package not working in toolchain +* ARROW-3516 - [C++] Use unsigned type for difference of pointers in parallel\_memcpy +* ARROW-3517 - [C++] MinGW 32bit build causes g++ segv +* ARROW-3524 - [C++] Fix compiler warnings from ARROW-3409 on clang-6 +* ARROW-3527 - [R] Unused variables in R-package C++ code +* ARROW-3528 - [R] Typo in R documentation +* ARROW-3535 - [Python] pip install tensorflow install too new numpy in manylinux1 build +* ARROW-3541 - [Rust] Update BufferBuilder to allow for new bit-packed BooleanArray +* ARROW-3544 - [Gandiva] Populate function registry in multiple compilation units to mitigate long compile times in release mode +* ARROW-3549 - [Rust] Replace i64 with usize for some bit utility functions +* ARROW-3573 - [Rust] with\_bitset does not set valid bits correctly +* ARROW-3580 - [Gandiva][C++] Build error with g++ 8.2.0 +* ARROW-3586 - [Python] Segmentation fault when converting empty table to pandas with categoricals +* ARROW-3598 - [Plasma] plasma\_store\_server fails linking with GPU enabled +* ARROW-3613 - [Go] Resize does not correctly update the length +* ARROW-3614 - [R] Handle Type::TIMESTAMP from Arrow to R +* ARROW-3658 - [Rust] validation of offsets buffer is incorrect for \`List\` +* 
ARROW-3670 - [C++] Use FindBacktrace to find execinfo.h support +* ARROW-3687 - [Rust] Anything measuring array slots should be \`usize\` +* ARROW-3698 - [C++] Segmentation fault when using a large table in Gandiva +* ARROW-3700 - [C++] CSV parser should allow ignoring empty lines +* ARROW-3703 - [Python] DataFrame.to\_parquet crashes if datetime column has time zones +* ARROW-3707 - [C++] test failure with zstd 1.3.7 +* ARROW-3711 - [C++] Don't pass CXX\_FLAGS to C\_FLAGS +* ARROW-3712 - [CI] License check regression (RAT failure) +* ARROW-3715 - [C++] gflags\_ep fails to build with CMake 3.13 +* ARROW-3716 - [R] Missing cases for ChunkedArray conversion +* ARROW-3728 - [Python] Merging Parquet Files - Pandas Meta in Schema Mismatch +* ARROW-3734 - [C++] Linking static zstd library fails on Arch x86-64 +* ARROW-3740 - [C++] Calling ArrayBuilder::Resize with length smaller than current appended length results in invalid state +* ARROW-3742 - Fix pyarrow.types & gandiva cython bindings +* ARROW-3745 - [C++] CMake passes static libraries multiple times to linker +* ARROW-3754 - [Packaging] Zstd configure error on linux package builds +* ARROW-3756 - [CI/Docker/Java] Java tests are failing in docker-compose setup +* ARROW-3762 - [C++] Parquet arrow::Table reads error when overflowing capacity of BinaryArray +* ARROW-3765 - [Gandiva] Segfault when the validity bitmap has not been allocated +* ARROW-3766 - [Python] pa.Table.from\_pandas doesn't use schema ordering +* ARROW-3768 - [Python] set classpath to hdfs not hadoop executable +* ARROW-3790 - [C++] Signed to unsigned integer cast yields incorrect results when type sizes are the same +* ARROW-3792 - [Python] Segmentation fault when writing empty RecordBatches to Parquet +* ARROW-3793 - [C++] TestScalarAppendUnsafe is not testing unsafe appends +* ARROW-3797 - [Rust] BinaryArray::value\_offset incorrect in offset case +* ARROW-3805 - [Gandiva] handle null validity bitmap in if-else expressions +* ARROW-3831 - [C++] arrow::util::Codec::Decompress() doesn't return decompressed data size +* ARROW-3835 - [C++] arrow::io::CompressedOutputStream::raw() implementation is missing +* ARROW-3837 - [C++] gflags link errors on Windows +* ARROW-3866 - [Python] Column metadata is not transferred to tables in pyarrow +* ARROW-3874 - [Gandiva] Cannot build: LLVM not detected correctly +* ARROW-3879 - [C++] cuda-test failure +* ARROW-3888 - [C++] Compilation warnings with gcc 7.3.0 +* ARROW-3889 - [Python] creating schema with invalid parameters causes segmentation fault +* ARROW-3890 - [Python] Creating Array with explicit string type fails on Python 2.7 +* ARROW-3894 - [Python] Error reading IPC file with no record batches +* ARROW-3898 - parquet-arrow example has compilation errors +* ARROW-3920 - Plasma reference counting not properly done in TensorFlow custom operator. 
+* ARROW-3931 - Make possible to build regardless of LANG +* ARROW-3936 - Add \_O\_NOINHERIT to the file open flags on Windows +* ARROW-3937 - [Rust] Rust nightly build is failing +* ARROW-3940 - [Python/Documentation] Add required packages to the development instruction +* ARROW-3941 - [R] RecordBatchStreamReader$schema +* ARROW-3942 - [R] Feather api fixes +* ARROW-3953 - Compat with pandas 0.24 rename of MultiIndex labels -> codes +* ARROW-3955 - [GLib] Add (transfer full) to free when no longer needed +* ARROW-3957 - [Python] Better error message when user connects to HDFS cluster with wrong port +* ARROW-3961 - [Python/Documentation] Fix wrong path in the pyarrow README +* ARROW-3969 - [Rust] CI build broken because rustfmt not available on nightly toolchain +* ARROW-3976 - [Ruby] Homebrew donation solicitation on CLI breaking CI builds +* ARROW-3977 - [Gandiva] gandiva cpp tests not running in CI +* ARROW-3979 - [Gandiva] fix all valgrind reported errors +* ARROW-3980 - [C++] Fix CRTP use in json-simple.cc +* ARROW-3989 - [Rust] CSV reader should handle case sensitivity for boolean values +* ARROW-3996 - [C++] Insufficient description on build +* ARROW-4008 - [C++] Integration test executable failure +* ARROW-4011 - [Gandiva] Refer irhelpers.bc in build directory +* ARROW-4019 - [C++] Fix coverity issues +* ARROW-4033 - [C++] thirdparty/download\_dependencies.sh uses tools or options not available in older Linuxes +* ARROW-4034 - [Ruby] Interface for FileOutputStream doesn't respect append=True +* ARROW-4041 - [CI] Python 2.7 run uses Python 3.6 +* ARROW-4049 - [C++] Arrow never use glog even though glog is linked. +* ARROW-4052 - [C++] Linker errors with glog and gflags +* ARROW-4053 - [Python/Integration] HDFS Tests failing with I/O operation on closed file +* ARROW-4055 - [Python] Fails to convert pytz.utc with versions 2018.3 and earlier +* ARROW-4058 - [C++] arrow-io-hdfs-test fails when run against HDFS cluster from docker-compose +* ARROW-4065 - [C++] arrowTargets.cmake is broken +* ARROW-4066 - Instructions to create Sphinx documentation +* ARROW-4070 - [C++] ARROW\_BOOST\_VENDORED doesn't work properly with ninja build +* ARROW-4073 - [Python] Parquet test failures on AppVeyor +* ARROW-4074 - [Python] test\_get\_library\_dirs\_win32 fails if libraries installed someplace different from conda or wheel packages +* ARROW-4078 - [CI] Run Travis job where documentation is built when docs/ is changed +* ARROW-4088 - [Python] Table.from\_batches() fails when passed a schema with metadata +* ARROW-4089 - [Plasma] The tutorial is wrong regarding the parameter type of PlasmaClient.Create +* ARROW-4101 - [C++] Binary identity cast not implemented +* ARROW-4106 - [Python] Tests fail to run because hypothesis update broke its API +* ARROW-4109 - [Packaging] Missing glog dependency from arrow-cpp conda recipe +* ARROW-4113 - [R] Version number patch broke build +* ARROW-4114 - [C++][DOCUMENTATION] +* ARROW-4115 - [Gandiva] valgrind complains that boolean output data buffer has uninited data +* ARROW-4118 - [Python] Error with "asv run" +* ARROW-4125 - [Python] ASV benchmarks fail to run if Plasma extension is not built (e.g. 
on Windows) +* ARROW-4126 - [Go] offset not used when accessing boolean array +* ARROW-4128 - [C++][DOCUMENTATION] Update style guide to reflect some more exceptions +* ARROW-4130 - [Go] offset not used when accessing binary array +* ARROW-4134 - [Packaging] Properly setup timezone in docker tests to prevent ORC adapter's abort +* ARROW-4135 - [Python] Can't reload a pandas dataframe containing a list of datetime.time +* ARROW-4138 - [Python] setuptools\_scm customization does not work for versions above 0.9.0 on Windows +* ARROW-4147 - [JAVA] Reduce heap usage for variable width vectors +* ARROW-4149 - [CI/C++] Parquet test misses ZSTD compression codec in CMake 3.2 nightly builds +* ARROW-4157 - [C++] -Wdocumentation failures with clang 6.0 on Ubuntu 18.04 +* ARROW-4171 - [Rust] fix parquet crate release version +* ARROW-4173 - JIRA library name is wrong in error message of dev/merge\_arrow\_pr.py +* ARROW-4178 - [C++] Fix TSan and UBSan errors +* ARROW-4179 - [Python] Tests crashing on all platforms in CI +* ARROW-4185 - [Rust] Appveyor builds are broken +* ARROW-4186 - [C++] BitmapWriters clobber the first byte when length=0 +* ARROW-4188 - [Rust] There should be a README in the top level rust directory +* ARROW-4197 - [C++] Emscripten compiler fails building Arrow +* ARROW-4200 - [C++] conda\_env\_\* files cannot be used to create a fresh conda environment on Windows +* ARROW-4209 - [Gandiva] returning IR structs causes issues with windows +* ARROW-4215 - [GLib] Fix typos in documentation +* ARROW-4227 - [GLib] Field in composite data type returns wrong data type +* ARROW-4237 - [Packaging] Fix CMAKE\_INSTALL\_LIBDIR in release verification script +* ARROW-4238 - [Packaging] Fix RC version conflict between crossbow and rake +* ARROW-4246 - [Plasma][Python] PlasmaClient.list doesn't work with CUDA enabled Plasma +* ARROW-4256 - [Release] Update Windows verification script for 0.12 release +* ARROW-4258 - [Python] Safe cast fails from numpy float64 array with nans to integer +* ARROW-4260 - [Python] test\_serialize\_deserialize\_pandas is failing in multiple build entries + +## Improvement + +* ARROW-1423 - [C++] Create non-owned CudaContext from context handle provided by thirdparty user +* ARROW-1688 - [Java] Fail build on checkstyle warnings +* ARROW-1993 - [Python] Add function for determining implied Arrow schema from pandas.DataFrame +* ARROW-2211 - [C++] Use simpler hash functions for integers +* ARROW-2216 - [CI] CI descriptions and envars are misleading +* ARROW-2475 - [Format] Confusing array length description +* ARROW-2483 - [Rust] use bit-packing for boolean vectors +* ARROW-2504 - [Website] Add ApacheCon NA link +* ARROW-2624 - [Python] Random schema and data generator for Arrow conversion and Parquet testing +* ARROW-2637 - [C++/Python] Build support and instructions for development on Alpine Linux +* ARROW-2670 - [C++/Python] Add Ubuntu 18.04 / gcc7 as a nightly build +* ARROW-2673 - [Python] Add documentation + docstring for ARROW-2661 +* ARROW-2684 - [Python] Various documentation improvements +* ARROW-2759 - Export notification socket of Plasma +* ARROW-2803 - [C++] Put hashing function into src/arrow/util +* ARROW-2807 - [Python] Enable memory-mapping to be toggled in get\_reader when reading Parquet files +* ARROW-2808 - [Python] Add unit tests for ProxyMemoryPool, enable new default MemoryPool to be constructed +* ARROW-2919 - [C++] Improve error message when listing empty HDFS file +* ARROW-2968 - [R] Multi-threaded conversion from Arrow table to R data.frame +* 
ARROW-3038 - [Go] add support for StringArray +* ARROW-3063 - [Go] move list of supported/TODO features to confluence +* ARROW-3070 - [Release] Host binary artifacts for RCs and releases on ASF Bintray account instead of dist/mirror system +* ARROW-3131 - [Go] add test for Go-1.11 +* ARROW-3161 - [Packaging] Ensure to run pyarrow unit tests in conda and wheel builds +* ARROW-3169 - [C++] Break array-test.cc and array.cc into multiple compilation units +* ARROW-3199 - [Plasma] Check for EAGAIN in recvmsg and sendmsg +* ARROW-3209 - [C++] Rename libarrow\_gpu to libarrow\_cuda +* ARROW-3230 - [Python] Missing comparisons on ChunkedArray, Table +* ARROW-3233 - [Python] Sphinx documentation for pyarrow.cuda GPU support +* ARROW-3278 - [Python] Retrieve StructType's and StructArray's field by name +* ARROW-3291 - [C++] Convenience API for constructing arrow::io::BufferReader from std::string +* ARROW-3312 - [R] Use same .clang-format file for both R binding C++ code and main C++ codebase +* ARROW-3318 - [C++] Convenience method for reading all batches from an IPC stream or file as arrow::Table +* ARROW-3331 - [C++] Add re2 to ThirdpartyToolchain +* ARROW-3353 - [Packaging] Build python 3.7 wheels +* ARROW-3358 - [Gandiva][C++] Replace usages of gandiva/status.h with arrow/status.h +* ARROW-3362 - [R] Guard against null buffers +* ARROW-3366 - [R] Dockerfile for docker-compose setup +* ARROW-3368 - [Integration/CI/Python] Add dask integration test to docker-compose setup +* ARROW-3402 - [Gandiva][C++] Utilize common bitmap operation implementations in precompiled IR routines +* ARROW-3409 - [C++] Add streaming compression interfaces +* ARROW-3421 - [C++] Add include-what-you-use setup to primary docker-compose.yml +* ARROW-3429 - [Packaging] Add a script to release binaries that use source archive at dist.apache.org +* ARROW-3430 - [Packaging] Add workaround to verify 0.11.0 +* ARROW-3431 - [GLib] Include Gemfile to archive +* ARROW-3432 - [Packaging] Variables aren't expanded in Subversion commit message +* ARROW-3440 - [Gandiva][C++] Remove outdated cpp/src/gandiva/README.md, add build documentation to cpp/README.md +* ARROW-3441 - [Gandiva][C++] Produce fewer test executables +* ARROW-3442 - [C++] Use dynamic linking for unit tests, ensure coverage working properly with clang +* ARROW-3451 - [Python] Allocate CUDA memory from a CUcontext created by numba.cuda +* ARROW-3455 - [Gandiva][C++] Support pkg-config for Gandiva +* ARROW-3456 - [CI] Reuse docker images and optimize docker-compose containers +* ARROW-3460 - [Packaging] Add a script to rebase master on local release branch +* ARROW-3461 - [Packaging] Add a script to upload RC artifacts as the official release +* ARROW-3462 - [Packaging] Update CHANGELOG for 0.11.0 +* ARROW-3463 - [Website] Update for 0.11.0 +* ARROW-3465 - [Documentation] Fix gen\_apidocs' docker image +* ARROW-3473 - [Format] Update Layout.md document to clarify use of 64-bit array lengths +* ARROW-3474 - [GLib] Extend gparquet API with get\_schema and read\_column +* ARROW-3479 - [R] Support to write record\_batch as stream +* ARROW-3482 - [C++] Build with JEMALLOC by default +* ARROW-3488 - [Packaging] Separate crossbow task definition files for packaging and tests +* ARROW-3492 - [C++] Build jemalloc in parallel +* ARROW-3493 - [Java] Document BOUNDS\_CHECKING\_ENABLED +* ARROW-3506 - [Packaging] Nightly tests for docker-compose images +* ARROW-3518 - [C++] Detect HOMEBREW\_PREFIX automatically +* ARROW-3521 - [GLib] Run Python using find\_program in 
meson.build +* ARROW-3530 - [Java/Python] Add conversion for pyarrow.Schema from org.apache…pojo.Schema +* ARROW-3533 - [Python/Documentation] Use sphinx\_rtd\_theme instead of Bootstrap +* ARROW-3539 - [CI/Packaging] Update scripts to build against vendored jemalloc +* ARROW-3542 - [C++] Use unsafe appends when building array from CSV +* ARROW-3545 - [C++/Python] Normalize child/field terminology with StructType +* ARROW-3547 - [R] Protect against Null crash when reading from RecordBatch +* ARROW-3548 - Speed up storing small objects in the object store. +* ARROW-3551 - Change MapD to OmniSci on Powered By page +* ARROW-3556 - [CI] Disable optimizations on Windows +* ARROW-3557 - [Python] Set language\_level in Cython sources +* ARROW-3558 - [Plasma] Remove fatal error when plasma client calls get on an unsealed object that it created. +* ARROW-3559 - Statically link libraries for plasma\_store\_server executable. +* ARROW-3562 - [R] Disallow creation of objects with null shared\_ptr +* ARROW-3563 - [C++] Declare public link dependencies so arrow\_static, plasma\_static automatically pull in transitive dependencies +* ARROW-3566 - Clarify that the type of dictionary encoded field should be the encoded(index) type +* ARROW-3574 - Fix remaining bug with plasma static versus shared libraries. +* ARROW-3576 - [Python] Expose compressed file readers as NativeFile +* ARROW-3577 - [Go] add support for ChunkedArray +* ARROW-3581 - [Gandiva][C++] ARROW\_PROTOBUF\_USE\_SHARED isn't used +* ARROW-3582 - [CI] Gandiva C++ build is always triggered +* ARROW-3584 - [Go] add support for Table +* ARROW-3587 - [Python] Efficient serialization for Arrow Objects (array, table, tensor, etc) +* ARROW-3589 - [Gandiva] Make it possible to compile gandiva without JNI +* ARROW-3591 - [R] Support to collect decimal type +* ARROW-3600 - [Packaging] Support Ubuntu 18.10 +* ARROW-3601 - [Rust] Release 0.11.0 +* ARROW-3602 - [Gandiva] [Python] Add preliminary Cython bindings for Gandiva +* ARROW-3603 - [Gandiva][C++] Can't build with vendored Boost +* ARROW-3605 - Remove AE library from plasma header files. +* ARROW-3607 - [Java] delete() method via JNI for plasma +* ARROW-3611 - Give error more quickly when pyarrow serialization context is used incorrectly. 
+* ARROW-3612 - [Go] implement RecordBatch and RecordBatchReader +* ARROW-3615 - [R] Support for NaN +* ARROW-3618 - [Packaging/Documentation] Add \`-c conda-forge\` option to avoid PackagesNotFoundError +* ARROW-3620 - [Python] Document multithreading options in Sphinx and add to api.rst +* ARROW-3621 - [Go] implement TableBatchReader +* ARROW-3622 - [Go] implement Schema.Equal +* ARROW-3623 - [Go] implement Field.Equal +* ARROW-3624 - [Python/C++] Support for zero-sized device buffers +* ARROW-3626 - [Go] add a CSV TableReader +* ARROW-3629 - [Python] Add write\_to\_dataset to Python Sphinx API listing +* ARROW-3632 - [Packaging] Update deb names in dev/tasks/tasks.yml in dev/release/00-prepare.sh +* ARROW-3633 - [Packaging] Update deb names in dev/tasks/tasks.yml for 0.12.0 +* ARROW-3634 - [GLib] cuda.cpp compile error +* ARROW-3636 - [C++/Python] Update arrow/python/pyarrow\_api.h +* ARROW-3638 - [C++][Python] Move reading from Feather as Table feature to C++ from Python +* ARROW-3639 - [Packaging] Run gandiva nightly packaging tasks +* ARROW-3640 - [Go] add support for Tensors +* ARROW-3641 - [C++/Python] remove public keyword from Cython api functions +* ARROW-3642 - [C++] Add arrowConfig.cmake generation +* ARROW-3645 - [Python] Document compression support in Sphinx +* ARROW-3646 - [Python] Add convenience factories to create IO streams +* ARROW-3647 - [R] Crash after unloading bit64 package +* ARROW-3648 - [Plasma] Add API to get metadata and data at the same time +* ARROW-3649 - [Rust] Refactor MutableBuffer's resize +* ARROW-3656 - [C++] Allow whitespace in numeric CSV fields +* ARROW-3657 - [R] Require bit64 package +* ARROW-3659 - [C++] Clang Travis build (matrix entry 2) might not actually be using clang +* ARROW-3661 - [Gandiva][GLib] Improve constant name +* ARROW-3666 - [C++] Improve CSV parser performance +* ARROW-3672 - [Go] implement Time32 array +* ARROW-3673 - [Go] implement Time64 array +* ARROW-3674 - [Go] implement Date32 array +* ARROW-3675 - [Go] implement Date64 array +* ARROW-3677 - [Go] implement FixedSizedBinary array +* ARROW-3681 - [Go] add benchmarks for CSV reader +* ARROW-3682 - [Go] unexport encoding/csv.Reader from CSV reader +* ARROW-3683 - [Go] add functional-option style to CSV reader +* ARROW-3684 - [Go] add chunk size option to CSV reader +* ARROW-3693 - [R] Invalid buffer for empty characters with null data +* ARROW-3694 - [Java] Avoid superfluous string creation when logging level is disabled +* ARROW-3695 - [Gandiva] Use add\_arrow\_lib() +* ARROW-3696 - [C++] Add feather::TableWriter::Write(table) +* ARROW-3697 - [Ruby] Add schema#[] +* ARROW-3704 - [Gandiva] Can't build with g++ 8.2.0 +* ARROW-3708 - [Packaging] Nightly CentOS builds are failing +* ARROW-3718 - [Gandiva] Remove spurious gtest include +* ARROW-3719 - [GLib] Support read/write table to/from Feather +* ARROW-3720 - [GLib] Use "indices" instead of "indexes" +* ARROW-3721 - [Gandiva] [Python] Support all Gandiva literals +* ARROW-3722 - [C++] Allow specifying column types to CSV reader +* ARROW-3724 - [GLib] Update gitignore +* ARROW-3725 - [GLib] Add field readers to GArrowStructDataType +* ARROW-3727 - [Python] Document use of pyarrow.foreign\_buffer, cuda.foreign\_buffer in Sphinx +* ARROW-3733 - [GLib] Add to\_string() to GArrowTable and GArrowColumn +* ARROW-3736 - [CI/Docker] Ninja test in docker-compose run cpp hangs +* ARROW-3743 - [Ruby] Add support for saving/loading Feather +* ARROW-3744 - [Ruby] Use garrow\_table\_to\_string() in Arrow::Table#to\_s +* ARROW-3746 - 
[Gandiva] [Python] Make it possible to list all functions registered with Gandiva +* ARROW-3747 - [C++] Flip order of data members in arrow::Decimal128 +* ARROW-3748 - [GLib] Add GArrowCSVReader +* ARROW-3749 - [GLib] Typos in documentation and test case name +* ARROW-3751 - [Python] Add more cython bindings for gandiva +* ARROW-3752 - [C++] Remove unused status::ArrowError +* ARROW-3753 - [Gandiva] Remove debug print +* ARROW-3773 - [C++] Remove duplicated AssertArraysEqual code in parquet/arrow/arrow-reader-writer-test.cc +* ARROW-3778 - [C++] Don't put implementations in test-util.h +* ARROW-3781 - [C++] Configure buffer size in arrow::io::BufferedOutputStream +* ARROW-3784 - [R] Array with type fails with x is not a vector +* ARROW-3785 - [C++] Use double-conversion conda package in CI toolchain +* ARROW-3787 - Implement From for BinaryArray +* ARROW-3788 - [Ruby] Add support for CSV parser written in C++ +* ARROW-3795 - [R] Support for retrieving NAs from INT64 arrays +* ARROW-3796 - [Rust] Add Example for PrimitiveArrayBuilder +* ARROW-3800 - [C++] Vendor a string\_view backport +* ARROW-3803 - [C++/Python] Split C++ and Python unit test Travis CI jobs, run all C++ tests (including Gandiva) together +* ARROW-3819 - [Packaging] Update conda variant files to conform with feedstock after compiler migration +* ARROW-3821 - [Format/Documentation]: Fix typos and grammar issues in Flight.proto comments +* ARROW-3825 - [Python] The Python README.md does not show how to run the unit test suite +* ARROW-3834 - [Doc] Merge Python & C++ and move to top-level +* ARROW-3836 - [C++] Add PREFIX option to ADD\_ARROW\_BENCHMARK +* ARROW-3839 - [Rust] Add ability to infer schema in CSV reader +* ARROW-3841 - [C++] warning: catching polymorphic type by value +* ARROW-3845 - [Gandiva] [GLib] Add GGandivaNode +* ARROW-3847 - [GLib] Remove unnecessary “\”. +* ARROW-3849 - Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64. 
+* ARROW-3852 - [C++] used uninitialized warning +* ARROW-3853 - [C++] Implement string to timestamp cast +* ARROW-3854 - [GLib] Deprecate garrow\_gio\_{input,output}\_stream\_get\_raw() +* ARROW-3855 - [Rust] Schema/Field/Datatype should implement serde traits +* ARROW-3856 - [Ruby] Support compressed CSV save/load +* ARROW-3858 - [GLib] Use {class\_name}\_get\_instance\_private +* ARROW-3862 - [C++] Improve dependencies download script +* ARROW-3863 - [GLib] Use travis\_retry with brew bundle command +* ARROW-3865 - [Packaging] Add double-conversion dependency to conda forge recipes and the windows wheel build +* ARROW-3868 - [Rust] Build against nightly Rust in CI +* ARROW-3870 - [C++] Add Peek to InputStream API +* ARROW-3871 - [R] Replace usages of C++ GetValuesSafely with new methods on ArrayData +* ARROW-3878 - [Rust] Improve primitive types +* ARROW-3880 - [Rust] PrimitiveArray should support simple math operations +* ARROW-3883 - [Rust] Update Rust README to reflect new functionality +* ARROW-3884 - [Python] Add LLVM6 to manylinux1 base image +* ARROW-3885 - [Rust] Update version to 0.12.0 and update release instructions on wiki +* ARROW-3886 - [C++] Additional test cases for ARROW-3831 +* ARROW-3893 - [C++] Improve adaptive int builder performance +* ARROW-3895 - [Rust] CSV reader should return Result<Option<RecordBatch>> not Option<Result<RecordBatch>> +* ARROW-3905 - [Ruby] Add StructDataType#[] +* ARROW-3906 - [C++] Break builder.cc into multiple compilation units +* ARROW-3908 - [Rust] Update rust dockerfile to use nightly toolchain +* ARROW-3910 - [Python] Set date\_as\_object to True in \*.to\_pandas as default after deduplicating logic implemented +* ARROW-3911 - [Python] Deduplicate datetime.date objects in Table.to\_pandas internals +* ARROW-3913 - [Gandiva] [GLib] Add GGandivaLiteralNode +* ARROW-3914 - [C++/Python/Packaging] Docker-compose setup for Alpine linux +* ARROW-3922 - [C++] improve the performance of bitmap operations +* ARROW-3925 - [Python] Include autoconf in Linux/macOS dependencies in conda environment +* ARROW-3928 - [Python] Add option to deduplicate PyBytes / PyString / PyUnicode objects in Table.to\_pandas conversion path +* ARROW-3929 - [Go] improve memory usage of CSV reader to improve runtime performances +* ARROW-3930 - [C++] Random test data generation is slow +* ARROW-3932 - [Python/Documentation] Include Benchmarks.md in Sphinx docs +* ARROW-3934 - [Gandiva] Don't compile precompiled tests if ARROW\_GANDIVA\_BUILD\_TESTS=off +* ARROW-3950 - [Plasma] Don't force loading the TensorFlow op on import +* ARROW-3952 - [Rust] Specify edition="2018" in Cargo.toml +* ARROW-3958 - [Plasma] Reduce number of IPCs +* ARROW-3960 - [Rust] remove extern crate for Rust 2018 +* ARROW-3963 - [Packaging/Docker] Nightly test for building sphinx documentations +* ARROW-3964 - [Go] More readable example for csv.Reader +* ARROW-3967 - [Gandiva] [C++] Make gandiva/node.h public +* ARROW-3971 - [Python] Remove APIs deprecated in 0.11 and prior +* ARROW-3974 - [C++] Combine field\_builders\_ and children\_ members in array/builder.h +* ARROW-3982 - [C++] Allow "binary" input in simple JSON format +* ARROW-3984 - [C++] Exit with error if user hits zstd ExternalProject path +* ARROW-3986 - [C++] Write prose documentation +* ARROW-3988 - [C++] Do not build unit tests by default in build system +* ARROW-3994 - [C++] Remove ARROW\_GANDIVA\_BUILD\_TESTS option +* ARROW-3995 - [CI] Use understandable names in Travis Matrix +* ARROW-3997 - [C++] [Doc] Clarify dictionary encoding integer signedness (and width?) 
+* ARROW-4002 - [C++][Gandiva] Remove CMake version check +* ARROW-4004 - [GLib] Replace GPU with CUDA +* ARROW-4005 - [Plasma] [GLib] Add gplasma\_client\_disconnect() +* ARROW-4006 - Add CODE\_OF\_CONDUCT.md +* ARROW-4009 - [CI] Run Valgrind and C++ code coverage in different builds +* ARROW-4015 - [Plasma] remove legacy interfaces for plasma manager +* ARROW-4017 - [C++] Check and update vendored libraries +* ARROW-4026 - [C++] Use separate modular $COMPONENT-test targets for unit tests +* ARROW-4029 - [C++] Define and document naming convention for internal / private header files not to be installed +* ARROW-4030 - [CI] Use travis\_terminate to halt builds when a step fails +* ARROW-4035 - [Ruby] Support msys2 mingw dependencies +* ARROW-4037 - [Packaging] Remove workaround to verify 0.11.0 +* ARROW-4038 - [Rust] Add array\_ops methods for boolean AND, OR, NOT +* ARROW-4042 - [Rust] Inconsistent method naming between BinaryArray and PrimitiveArray +* ARROW-4048 - [GLib] Return ChunkedArray instead of Array in gparquet\_arrow\_file\_reader\_read\_column +* ARROW-4051 - [Gandiva] [GLib] Add support for null literal +* ARROW-4054 - [Python] Update gtest, flatbuffers and OpenSSL in manylinux1 base image +* ARROW-4069 - [Python] Add tests for casting from binary to utf8 +* ARROW-4080 - [Rust] Improving lengthy build times in Appveyor +* ARROW-4082 - [C++] CMake tweaks: allow RelWithDebInfo, improve FindClangTools +* ARROW-4084 - [C++] Simplify Status and stringstream boilerplate +* ARROW-4085 - [GLib] Use "field" for struct data type +* ARROW-4087 - [C++] Make CSV nulls configurable +* ARROW-4093 - [C++] Deprecated method suggests wrong method +* ARROW-4098 - [Python] Deprecate pyarrow.open\_stream,open\_file in favor of pa.ipc.open\_stream/open\_file +* ARROW-4102 - [C++] FixedSizeBinary identity cast not implemented +* ARROW-4103 - [Documentation] Add README to docs/ root +* ARROW-4105 - Add rust-toolchain to enforce user to use nightly toolchain for building +* ARROW-4107 - [Python] Use ninja in pyarrow manylinux1 build +* ARROW-4116 - [Python] Clarify in development.rst that virtualenv cannot be used with miniconda/Anaconda +* ARROW-4122 - [C++] Initialize some uninitialized class members +* ARROW-4127 - [Documentation] Add Docker build instructions +* ARROW-4129 - [Python] Fix syntax problem in benchmark docs +* ARROW-4152 - [GLib] Remove an example to show Torch integration +* ARROW-4155 - [Rust] Implement array\_ops::sum() for PrimitiveArray +* ARROW-4158 - [Dev] Allow maintainers to use a GitHub API token when merging pull requests +* ARROW-4160 - [Rust] Add README and executable files to parquet +* ARROW-4168 - [GLib] Use property to keep GArrowDataType passed in garrow\_field\_new() +* ARROW-4177 - [C++] Add ThreadPool and TaskGroup microbenchmarks +* ARROW-4191 - [C++] Use same CC and AR for jemalloc as for the main sources +* ARROW-4199 - [GLib] Add garrow\_seekable\_input\_stream\_peek() +* ARROW-4207 - [Gandiva] [GLib] Add support for IfNode +* ARROW-4211 - [GLib] Add GArrowFixedSizeBinaryDataType +* ARROW-4216 - [Python] Add CUDA API docs +* ARROW-4228 - [GLib] Add garrow\_list\_data\_type\_get\_field() +* ARROW-4229 - [Packaging] Set crossbow target explicitly to enable building arbitrary arrow repo +* ARROW-4233 - [Packaging] Create a Dockerfile to build source archive +* ARROW-4240 - [Packaging] Documents for Plasma GLib and Gandiva GLib are missing in source archive +* ARROW-4243 - [Python] Test failure with pandas 0.24.0rc1 +* ARROW-4249 - [Plasma] Remove reference to 
logging.h from plasma/common.h +* ARROW-4257 - [Release] Update release verification script to check binaries on Bintray +* ARROW-4269 - [Python] AttributeError: module 'pandas.core' has no attribute 'arrays' +* ARROW-912 - [Python] Account for multiarch systems in development.rst + +## New Feature + +* ARROW-1019 - [C++] Implement input stream and output stream with Gzip codec +* ARROW-1492 - [C++] Type casting function kernel suite +* ARROW-1696 - [C++] Add codec benchmarks +* ARROW-2712 - [C#] Initial C# .NET library +* ARROW-3020 - [Python] Addition of option to allow empty Parquet row groups +* ARROW-3108 - [C++] arrow::PrettyPrint for Table instances +* ARROW-3126 - [Python] Make Buffered\* IO classes available to Python, incorporate into input\_stream, output\_stream factory functions +* ARROW-3184 - [C++] Add modular build targets, "all" target, and require explicit target when invoking make or ninja +* ARROW-3303 - [C++] Enable example arrays to be written with a simplified JSON representation +* ARROW-3306 - [R] Objects and support functions different kinds of arrow::Buffer +* ARROW-3307 - [R] Convert chunked arrow::Column to R vector +* ARROW-3310 - [R] Create wrapper classes for various Arrow IO interfaces +* ARROW-3340 - [R] support for dates and time classes +* ARROW-3355 - [R] Support for factors +* ARROW-3380 - [Python] Support reading CSV files and more from a gzipped file +* ARROW-3381 - [C++] Implement InputStream for bz2 files +* ARROW-3387 - [C++] Function to cast binary to string/utf8 with UTF8 validation +* ARROW-3398 - [Rust] Update existing Builder to use MutableBuffer internally +* ARROW-3407 - [C++] Add UTF8 conversion modes in CSV reader conversion options +* ARROW-3439 - [R] R language bindings for Feather format +* ARROW-3450 - [R] Wrap MemoryMappedFile class +* ARROW-3490 - [R] streaming arrow objects to output streams +* ARROW-3499 - [R] Expose arrow::ipc::Message type +* ARROW-3504 - [Plasma] Add support for Plasma Client to put/get raw bytes without pyarrow serialization. +* ARROW-3505 - [R] Read record batch and table +* ARROW-3515 - Introduce NumericTensor class +* ARROW-3529 - [Ruby] Import Red Parquet +* ARROW-3536 - [C++] Fast UTF8 validation functions +* ARROW-3537 - [Rust] Implement Tensor Type +* ARROW-3540 - [Rust] Incorporate BooleanArray into PrimitiveArray +* ARROW-3555 - [Plasma] Unify plasma client get function using metadata. 
+* ARROW-3567 - [Gandiva] [GLib] Add GLib bindings of Gandiva +* ARROW-3583 - [Python/Java] Create RecordBatch from VectorSchemaRoot +* ARROW-3592 - [Python] Get BinaryArray value as zero copy memory view +* ARROW-3608 - [R] Support for time32 and time64 array types +* ARROW-3610 - [C++] Add interface to turn stl\_allocator into arrow::MemoryPool +* ARROW-3630 - [Plasma] [GLib] Add GLib bindings of Plasma +* ARROW-3660 - [C++] Don't unnecessarily lock MemoryMappedFile for resizing in readonly files +* ARROW-3662 - [C++] Add a const overload to MemoryMappedFile::GetSize +* ARROW-3692 - [Gandiva] [Ruby] Add Ruby bindings of Gandiva +* ARROW-3723 - [Plasma] [Ruby] Add Ruby bindings of Plasma +* ARROW-3726 - [Rust] CSV Reader & Writer +* ARROW-3731 - [R] R API for reading and writing Parquet files +* ARROW-3738 - [C++] Add CSV conversion option to parse ISO8601-like timestamp strings +* ARROW-3741 - [R] Add support for arrow::compute::Cast to convert Arrow arrays from one type to another +* ARROW-3755 - [GLib] Support for CompressedInputStream, CompressedOutputStream +* ARROW-3760 - [R] Support Arrow CSV reader +* ARROW-3782 - [C++] Implement BufferedReader for C++ +* ARROW-3798 - [GLib] Add support for column type CSV read options +* ARROW-3807 - [R] Missing Field API +* ARROW-3823 - [R] + buffer.complex +* ARROW-3830 - [GLib] Add GArrowCodec +* ARROW-3842 - [R] RecordBatchStreamWriter api +* ARROW-3864 - [GLib] Add support for allow-float-truncate cast option +* ARROW-3900 - [GLib] Add garrow\_mutable\_buffer\_set\_data() +* ARROW-3912 - [Plasma][GLib] Add support for creating and referring objects +* ARROW-3916 - [Python] Support caller-provided filesystem in \`ParquetWriter\` constructor +* ARROW-3924 - [Packaging][Plasma] Add support for Plasma deb/rpm packages +* ARROW-3938 - [Packaging] Stop to refer java/pom.xml to get version information +* ARROW-3945 - [Website] Blog post about Gandiva code donation +* ARROW-3946 - [GLib] Add support for union +* ARROW-3959 - [Rust] Time and Timestamp Support +* ARROW-4028 - [Rust] Merge parquet-rs codebase +* ARROW-4112 - [Packaging][Gandiva] Add support for deb packages +* ARROW-4132 - [GLib] Add more GArrowTable constructors +* ARROW-4141 - [Ruby] Add support for creating schema from raw Ruby objects +* ARROW-4153 - [GLib] Add builder\_append\_value() for consistency +* ARROW-4154 - [GLib] Add GArrowDecimal128DataType +* ARROW-4161 - [GLib] Add GPlasmaClientOptions +* ARROW-4162 - [Ruby] Add support for creating data types from description +* ARROW-4166 - [Ruby] Add support for saving to and loading from buffer +* ARROW-4174 - [Ruby] Add support for building composite array from raw Ruby objects +* ARROW-4175 - [GLib] Add support for decimal compare operators +* ARROW-4183 - [Ruby] Add Arrow::Struct as an element of Arrow::StructArray +* ARROW-4184 - [Ruby] Add Arrow::RecordBatch#to\_table +* ARROW-4214 - [Ruby] Add support for building RecordBatch from raw Ruby objects +* ARROW-45 - [Python] Add unnest/flatten function for List types +* ARROW-554 - [C++] Implement functions to conform unequal dictionaries amongst multiple Arrow arrays +* ARROW-854 - [Format] Support sparse tensor + +## Sub-task + +* ARROW-3272 - [Java] Document checkstyle deviations from Google style guide +* ARROW-3273 - [Java] checkstyle - fix javadoc style +* ARROW-3323 - [Java] checkstyle - fix naming +* ARROW-3347 - [Rust] Implement PrimitiveArrayBuilder +* ARROW-3568 - [Packaging] Run pyarrow unittests for windows wheels +* ARROW-3569 - [Packaging] Run pyarrow unittests 
when building conda package +* ARROW-3588 - [Java] checkstyle - fix license +* ARROW-3616 - [Java] checkstyle - fix remaining coding checks +* ARROW-3664 - [Rust] Add benchmark for PrimitiveArrayBuilder +* ARROW-3665 - [Rust] Implement StructArrayBuilder +* ARROW-3713 - [Rust] Implement BinaryArrayBuilder +* ARROW-3891 - [Java] Remove Long.bitCount with simple bitmap operations +* ARROW-3939 - [Rust] Remove macro definition for ListArrayBuilder +* ARROW-3948 - [CI][GLib] Set timeout to Homebrew +* ARROW-4060 - [Rust] Add Parquet/Arrow schema converter +* ARROW-4075 - [Rust] Reuse array builder after calling finish() +* ARROW-4172 - [Rust] more consistent naming in array builders + +## Task + +* ARROW-2337 - [Scripts] Windows release verification script should use boost DSOs instead of static linkage +* ARROW-2535 - [Python] Provide pre-commit hooks that check flake8 +* ARROW-2560 - [Rust] The Rust README should include Rust-specific information on contributing +* ARROW-2653 - [C++] Refactor hash table support +* ARROW-2720 - [C++] Clean up cmake CXX\_STANDARD and PIC flag setting +* ARROW-3194 - [Java] Fix setValueCount in splitAndTransfer for variable width vectors +* ARROW-3383 - [Java] Run Gandiva tests in Travis CI +* ARROW-3384 - [Gandiva] Sync remaining commits from gandiva repo +* ARROW-3385 - [Java] [Gandiva] Deploy gandiva snapshot jars automatically +* ARROW-3427 - [C++] Add Windows support, Unix static libs for double-conversion package in conda-forge +* ARROW-3469 - [Gandiva] add travis entry for gandiva on OSX +* ARROW-3472 - [Gandiva] remove gandiva helpers library +* ARROW-3487 - [Gandiva] simplify NULL\_IF\_NULL functions that can return errors +* ARROW-3489 - [Gandiva] Support for in expressions +* ARROW-3501 - [Gandiva] Enable building with gcc 4.8.x on Ubuntu Trusty, similar distros +* ARROW-3519 - [Gandiva] Add support for functions that can return variable len output +* ARROW-3597 - [Gandiva] gandiva should integrate with ADD\_ARROW\_TEST for tests +* ARROW-3609 - [Gandiva] Move benchmark tests out of unit test +* ARROW-3701 - [Gandiva] Add support for decimal operations +* ARROW-3859 - [Java] Fix ComplexWriter backward incompatible change +* ARROW-3860 - [Gandiva] [C++] Add option to use -static-libstdc++ when building libgandiva\_jni.so +* ARROW-3867 - [Documentation] Uploading binary release artifacts to Bintray +* ARROW-3970 - [Gandiva][C++] Remove unnecessary boost dependencies +* ARROW-3983 - [Gandiva][Crossbow] Use static boost while packaging +* ARROW-3993 - [JS] CI Jobs Failing +* ARROW-4039 - Update link to 'development.rst' page from Python README.md +* ARROW-4043 - [Packaging/Docker] Python tests on alpine miss pytest dependency +* ARROW-4044 - [Packaging/Python] Add hypothesis test dependency to pyarrow conda recipe +* ARROW-4045 - [Packaging/Python] Add hypothesis test dependency to wheel crossbow tests +* ARROW-4100 - [Gandiva][C++] Fix regex to ignore "." 
character +* ARROW-4148 - [CI/Python] Disable ORC on nightly Alpine builds +* ARROW-4151 - [Rust] Restructure project directories +* ARROW-4210 - [Python] Mention boost-cpp directly in the conda meta.yaml for pyarrow +* ARROW-4239 - [Release] Updating .deb package names in the prepare script failed to run on OSX +* ARROW-4241 - [Packaging] Disable crossbow conda OSX clang builds +* ARROW-4266 - [Python][CI] Disable ORC tests in dask integration test +* ARROW-4270 - [Packaging][Conda] Update xcode version and remove toolchain builds + +## Test + +* ARROW-4137 - [Rust] Move parquet code into a separate crate + +## Wish + +* ARROW-3248 - [C++] Arrow tests should have label "arrow" +* ARROW-3260 - [CI] Make linting a separate job +* ARROW-3844 - [C++] Remove ARROW\_USE\_SSE and ARROW\_SSE3 +* ARROW-3851 - [C++] "make check-format" is slow +* ARROW-4079 - [C++] Add machine benchmarks +* ARROW-4150 - [C++] Do not return buffers containing nullptr from internal allocations +* ARROW-4156 - [C++] xcodebuild failure for cmake generated project + # Apache Arrow 0.11.0 (08 October 2018) ## Bug @@ -2620,3 +3224,4 @@ * ARROW-260 - TestValueVector.testFixedVectorReallocation and testVariableVectorReallocation are flaky * ARROW-83 - Add basic test infrastructure for DecimalType + diff --git a/docs/Benchmarks.md b/CODE_OF_CONDUCT.md similarity index 69% rename from docs/Benchmarks.md rename to CODE_OF_CONDUCT.md index c84bf0dc1eb62..2efe740b77c50 100644 --- a/docs/Benchmarks.md +++ b/CODE_OF_CONDUCT.md @@ -16,14 +16,9 @@ specific language governing permissions and limitations under the License. --> -## Benchmark Requirements -The benchmarks are run using [asv][1] which is also their only requirement. +# Code of Conduct -## Running the benchmarks +* [Code of Conduct for The Apache Software Foundation][1] -To run the benchmarks, call `asv run --python=same`. You cannot use the -plain `asv run` command at the moment as asv cannot handle python packages -in subdirectories of a repository. - -[1]: https://asv.readthedocs.org/ +[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt index 5c9aaddc14ff8..ad2255d431066 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -681,7 +681,7 @@ See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- -The file cpp/src/arrow/util/date.h has the following license (MIT) +The file cpp/src/arrow/vendored/date.h has the following license (MIT) The MIT License (MIT) Copyright (c) 2015, 2016, 2017 Howard Hinnant @@ -736,7 +736,7 @@ SOFTWARE. -------------------------------------------------------------------------------- -The file cpp/src/util/string_view/string_view.hpp has the following license +The file cpp/src/arrow/vendored/string_view.hpp has the following license Boost Software License - Version 1.0 - August 17th, 2003 @@ -764,7 +764,7 @@ DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- -The files in cpp/src/arrow/util/xxhash/ have the following license +The files in cpp/src/arrow/vendored/xxhash/ have the following license (BSD 2-Clause License) xxHash Library @@ -795,3 +795,36 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - xxHash homepage: http://www.xxhash.com - xxHash source repository : https://github.com/Cyan4973/xxHash + +-------------------------------------------------------------------------------- + +The files in dev/tasks/conda-recipes/variants have the following license + +BSD 3-clause license +Copyright (c) 2015-2018, conda-forge +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/appveyor.yml b/appveyor.yml index 18ad9f5f56c5d..dbf13ff278dc7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,6 +22,7 @@ only_commits: # Skip commits not related to Python, C++ or Rust files: - appveyor.yml + - c_glib/ - ci/ - cpp/ - format/ @@ -34,6 +35,14 @@ cache: matrix: fast_finish: true + allow_failures: + # Can't build with 32-bit MinGW for now. 
+ # See https://issues.apache.org/jira/browse/ARROW-4297 + - JOB: "MinGW32" + MINGW_PACKAGE_PREFIX: mingw-w64-i686 + MINGW_PREFIX: c:\msys64\mingw32 + MSYSTEM: MINGW32 + USE_CLCACHE: false environment: global: @@ -60,13 +69,21 @@ environment: GENERATOR: Ninja CONFIGURATION: "Release" BUILD_SCRIPT: "CMake_Build_Script" + - JOB: "MinGW32" + MINGW_PACKAGE_PREFIX: mingw-w64-i686 + MINGW_PREFIX: c:\msys64\mingw32 + MSYSTEM: MINGW32 + USE_CLCACHE: false + - JOB: "MinGW64" + MINGW_PACKAGE_PREFIX: mingw-w64-x86_64 + MINGW_PREFIX: c:\msys64\mingw64 + MSYSTEM: MINGW64 + USE_CLCACHE: false - JOB: "Rust" TARGET: x86_64-pc-windows-msvc USE_CLCACHE: false MSVC_DEFAULT_OPTIONS: ON - BOOST_ROOT: C:\Libraries\boost_1_67_0 - BOOST_LIBRARYDIR: C:\Libraries\boost_1_67_0\lib64-msvc-14.0 APPVEYOR_SAVE_CACHE_ON_ERROR: true install: diff --git a/c_glib/Dockerfile b/c_glib/Dockerfile index 5d64a5f154f62..7abfa17a6b678 100644 --- a/c_glib/Dockerfile +++ b/c_glib/Dockerfile @@ -17,9 +17,7 @@ FROM arrow:cpp -ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -q install --no-install-recommends -y \ - tzdata \ ruby-dev \ pkg-config \ autoconf-archive \ @@ -27,7 +25,7 @@ RUN apt-get -q install --no-install-recommends -y \ libgirepository1.0-dev ADD c_glib/Gemfile /arrow/c_glib/ -RUN conda install -c conda-forge meson=0.47.1 && \ +RUN conda install meson=0.47.1 && \ conda clean --all && \ gem install bundler && \ bundle install --gemfile arrow/c_glib/Gemfile diff --git a/c_glib/Makefile.am b/c_glib/Makefile.am index 149894c8241c2..53bb57e411b0c 100644 --- a/c_glib/Makefile.am +++ b/c_glib/Makefile.am @@ -24,8 +24,7 @@ SUBDIRS = \ parquet-glib \ plasma-glib \ doc \ - example \ - tool + example EXTRA_DIST = \ Gemfile \ diff --git a/c_glib/arrow-cuda-glib/cuda.cpp b/c_glib/arrow-cuda-glib/cuda.cpp index 3f82f8fa806cb..9679cc0ff7fd8 100644 --- a/c_glib/arrow-cuda-glib/cuda.cpp +++ b/c_glib/arrow-cuda-glib/cuda.cpp @@ -648,7 +648,7 @@ garrow_cuda_ipc_memory_handle_new(const guint8 *data, * * Returns: (transfer full): A newly created #GArrowBuffer on success, * %NULL on error. The buffer has serialized @handle. The serialized - * @handle can be deserialized by garrow_gpu_cuda_ipc_memory_handle_new() + * @handle can be deserialized by garrow_cuda_ipc_memory_handle_new() * in other process. 
* * Since: 0.8.0 diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am index bf97168eb81d7..a296595571438 100644 --- a/c_glib/arrow-glib/Makefile.am +++ b/c_glib/arrow-glib/Makefile.am @@ -59,7 +59,7 @@ libarrow_glib_la_headers = \ composite-array.h \ composite-data-type.h \ data-type.h \ - decimal.h \ + decimal128.h \ error.h \ field.h \ gobject-type.h \ @@ -110,7 +110,7 @@ libarrow_glib_la_sources = \ column.cpp \ composite-array.cpp \ composite-data-type.cpp \ - decimal.cpp \ + decimal128.cpp \ error.cpp \ field.cpp \ record-batch.cpp \ @@ -155,7 +155,7 @@ libarrow_glib_la_cpp_headers = \ codec.hpp \ column.hpp \ data-type.hpp \ - decimal.hpp \ + decimal128.hpp \ error.hpp \ field.hpp \ record-batch.hpp \ diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index a5c75790de939..095c68d87689d 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -23,16 +23,16 @@ #include #include +#include #include #include -#include template gboolean -garrow_array_builder_append(GArrowArrayBuilder *builder, - VALUE value, - GError **error, - const gchar *context) +garrow_array_builder_append_value(GArrowArrayBuilder *builder, + VALUE value, + GError **error, + const gchar *context) { auto arrow_builder = static_cast(garrow_array_builder_get_raw(builder)); @@ -446,17 +446,38 @@ garrow_boolean_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_boolean_array_builder_append_value() instead. */ gboolean garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error) { - return garrow_array_builder_append + return garrow_boolean_array_builder_append_value(builder, value, error); +} + +/** + * garrow_boolean_array_builder_append_value: + * @builder: A #GArrowBooleanArrayBuilder. + * @value: A boolean value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_boolean_array_builder_append_value(GArrowBooleanArrayBuilder *builder, + gboolean value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), static_cast(value), error, - "[boolean-array-builder][append]"); + "[boolean-array-builder][append-value]"); } /** @@ -583,17 +604,38 @@ garrow_int_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.6.0 + * + * Deprecated: 0.12.0: + * Use garrow_int_array_builder_append_value() instead. */ gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_int_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int_array_builder_append_value: + * @builder: A #GArrowIntArrayBuilder. + * @value: A int value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
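+ *
+ * For example (a minimal sketch; the value 29 and the error handling
+ * are illustrative):
+ * |[
+ * GArrowIntArrayBuilder *builder;
+ * GError *error = NULL;
+ *
+ * builder = garrow_int_array_builder_new();
+ * if (!garrow_int_array_builder_append_value(builder, 29, &error)) {
+ *   g_print("failed to append: %s\n", error->message);
+ *   g_error_free(error);
+ * }
+ * g_object_unref(builder);
+ * ]|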
+ * + * Since: 0.12.0 + */ +gboolean +garrow_int_array_builder_append_value(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int-array-builder][append]"); + "[int-array-builder][append-value]"); } /** @@ -718,17 +760,38 @@ garrow_uint_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.8.0 + * + * Deprecated: 0.12.0: + * Use garrow_uint_array_builder_append_value() instead. */ gboolean garrow_uint_array_builder_append(GArrowUIntArrayBuilder *builder, guint64 value, GError **error) { - return garrow_array_builder_append + return garrow_uint_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint_array_builder_append_value: + * @builder: A #GArrowUIntArrayBuilder. + * @value: A unsigned int value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_uint_array_builder_append_value(GArrowUIntArrayBuilder *builder, + guint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint-array-builder][append]"); + "[uint-array-builder][append-value]"); } /** @@ -848,17 +911,38 @@ garrow_int8_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int8_array_builder_append_value() instead. */ gboolean garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error) { - return garrow_array_builder_append + return garrow_int8_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int8_array_builder_append_value: + * @builder: A #GArrowInt8ArrayBuilder. + * @value: A int8 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int8_array_builder_append_value(GArrowInt8ArrayBuilder *builder, + gint8 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int8-array-builder][append]"); + "[int8-array-builder][append-value]"); } /** @@ -976,17 +1060,38 @@ garrow_uint8_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint8_array_builder_append_value() instead. */ gboolean garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error) { - return garrow_array_builder_append + return garrow_uint8_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint8_array_builder_append_value: + * @builder: A #GArrowUInt8ArrayBuilder. + * @value: An uint8 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_uint8_array_builder_append_value(GArrowUInt8ArrayBuilder *builder, + guint8 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint8-array-builder][append]"); + "[uint8-array-builder][append-value]"); } /** @@ -1104,17 +1209,38 @@ garrow_int16_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int16_array_builder_append_value() instead. */ gboolean garrow_int16_array_builder_append(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error) { - return garrow_array_builder_append + return garrow_int16_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int16_array_builder_append_value: + * @builder: A #GArrowInt16ArrayBuilder. + * @value: A int16 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int16_array_builder_append_value(GArrowInt16ArrayBuilder *builder, + gint16 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int16-array-builder][append]"); + "[int16-array-builder][append-value]"); } /** @@ -1232,17 +1358,38 @@ garrow_uint16_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint16_array_builder_append_value() instead. */ gboolean garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, guint16 value, GError **error) { - return garrow_array_builder_append + return garrow_uint16_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint16_array_builder_append_value: + * @builder: A #GArrowUInt16ArrayBuilder. + * @value: An uint16 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_uint16_array_builder_append_value(GArrowUInt16ArrayBuilder *builder, + guint16 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint16-array-builder][append]"); + "[uint16-array-builder][append-value]"); } /** @@ -1360,17 +1507,38 @@ garrow_int32_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int32_array_builder_append_value() instead. */ gboolean garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error) { - return garrow_array_builder_append + return garrow_int32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int32_array_builder_append_value: + * @builder: A #GArrowInt32ArrayBuilder. + * @value: A int32 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_int32_array_builder_append_value(GArrowInt32ArrayBuilder *builder, + gint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int32-array-builder][append]"); + "[int32-array-builder][append-value]"); } /** @@ -1488,17 +1656,38 @@ garrow_uint32_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint32_array_builder_append_value() instead. */ gboolean garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error) { - return garrow_array_builder_append + return garrow_uint32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint32_array_builder_append_value: + * @builder: A #GArrowUInt32ArrayBuilder. + * @value: An uint32 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_uint32_array_builder_append_value(GArrowUInt32ArrayBuilder *builder, + guint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint32-array-builder][append]"); + "[uint32-array-builder][append-value]"); } /** @@ -1616,17 +1805,38 @@ garrow_int64_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int64_array_builder_append_value() instead. */ gboolean garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_int64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int64_array_builder_append_value: + * @builder: A #GArrowInt64ArrayBuilder. + * @value: A int64 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int64_array_builder_append_value(GArrowInt64ArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int64-array-builder][append]"); + "[int64-array-builder][append-value]"); } /** @@ -1744,17 +1954,38 @@ garrow_uint64_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint64_array_builder_append_value() instead. */ gboolean garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, guint64 value, GError **error) { - return garrow_array_builder_append + return garrow_uint64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint64_array_builder_append_value: + * @builder: A #GArrowUInt64ArrayBuilder. + * @value: An uint64 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_uint64_array_builder_append_value(GArrowUInt64ArrayBuilder *builder, + guint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint64-array-builder][append]"); + "[uint64-array-builder][append-value]"); } /** @@ -1872,17 +2103,38 @@ garrow_float_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_float_array_builder_append_value() instead. */ gboolean garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, gfloat value, GError **error) { - return garrow_array_builder_append + return garrow_float_array_builder_append_value(builder, value, error); +} + +/** + * garrow_float_array_builder_append_value: + * @builder: A #GArrowFloatArrayBuilder. + * @value: A float value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_float_array_builder_append_value(GArrowFloatArrayBuilder *builder, + gfloat value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[float-array-builder][append]"); + "[float-array-builder][append-value]"); } /** @@ -2000,17 +2252,38 @@ garrow_double_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_double_array_builder_append_value() instead. */ gboolean garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error) { - return garrow_array_builder_append + return garrow_double_array_builder_append_value(builder, value, error); +} + +/** + * garrow_double_array_builder_append_value: + * @builder: A #GArrowDoubleArrayBuilder. + * @value: A double value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_double_array_builder_append_value(GArrowDoubleArrayBuilder *builder, + gdouble value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[double-array-builder][append]"); + "[double-array-builder][append-value]"); } /** @@ -2129,19 +2402,44 @@ garrow_binary_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_binary_array_builder_append_value() instead. */ gboolean garrow_binary_array_builder_append(GArrowBinaryArrayBuilder *builder, const guint8 *value, gint32 length, GError **error) +{ + return garrow_binary_array_builder_append_value(builder, value, length, error); +} + +/** + * garrow_binary_array_builder_append_value: + * @builder: A #GArrowBinaryArrayBuilder. + * @value: (array length=length): A binary value. + * @length: A value length. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
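+ *
+ * For example (a sketch; the three bytes are arbitrary sample data):
+ * |[
+ * GArrowBinaryArrayBuilder *builder = garrow_binary_array_builder_new();
+ * const guint8 data[] = {0x01, 0x02, 0x03};
+ *
+ * garrow_binary_array_builder_append_value(builder, data, 3, NULL);
+ * g_object_unref(builder);
+ * ]|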
+ * + * Since: 0.12.0 + */ +gboolean +garrow_binary_array_builder_append_value(GArrowBinaryArrayBuilder *builder, + const guint8 *value, + gint32 length, + GError **error) { auto arrow_builder = static_cast( garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); auto status = arrow_builder->Append(value, length); - return garrow_error_check(error, status, "[binary-array-builder][append]"); + return garrow_error_check(error, + status, + "[binary-array-builder][append-value]"); } /** @@ -2197,11 +2495,32 @@ garrow_string_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_string_array_builder_append_value() instead. */ gboolean garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, const gchar *value, GError **error) +{ + return garrow_string_array_builder_append_value(builder, value, error); +} + +/** + * garrow_string_array_builder_append_value: + * @builder: A #GArrowStringArrayBuilder. + * @value: A string value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_string_array_builder_append_value(GArrowStringArrayBuilder *builder, + const gchar *value, + GError **error) { auto arrow_builder = static_cast( @@ -2209,7 +2528,9 @@ garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, auto status = arrow_builder->Append(value, static_cast(strlen(value))); - return garrow_error_check(error, status, "[string-array-builder][append]"); + return garrow_error_check(error, + status, + "[string-array-builder][append-value]"); } /** @@ -2290,17 +2611,38 @@ garrow_date32_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_date32_array_builder_append_value() instead. */ gboolean garrow_date32_array_builder_append(GArrowDate32ArrayBuilder *builder, gint32 value, GError **error) { - return garrow_array_builder_append + return garrow_date32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_date32_array_builder_append_value: + * @builder: A #GArrowDate32ArrayBuilder. + * @value: The number of days since UNIX epoch in signed 32bit integer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_date32_array_builder_append_value(GArrowDate32ArrayBuilder *builder, + gint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[date32-array-builder][append]"); + "[date32-array-builder][append-value]"); } /** @@ -2425,17 +2767,38 @@ garrow_date64_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_date64_array_builder_append_value() instead. */ gboolean garrow_date64_array_builder_append(GArrowDate64ArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_date64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_date64_array_builder_append_value: + * @builder: A #GArrowDate64ArrayBuilder. + * @value: The number of milliseconds since UNIX epoch in signed 64bit integer. + * @error: (nullable): Return location for a #GError or %NULL. 
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ *
+ * Since: 0.12.0
+ */
+gboolean
+garrow_date64_array_builder_append_value(GArrowDate64ArrayBuilder *builder,
+                                         gint64 value,
+                                         GError **error)
+{
+  return garrow_array_builder_append_value
     (GARROW_ARRAY_BUILDER(builder),
      value,
      error,
-     "[date64-array-builder][append]");
+     "[date64-array-builder][append-value]");
 }
 
 /**
@@ -2562,17 +2925,38 @@ garrow_timestamp_array_builder_new(GArrowTimestampDataType *data_type)
  * Returns: %TRUE on success, %FALSE if there was an error.
  *
  * Since: 0.7.0
+ *
+ * Deprecated: 0.12.0:
+ *   Use garrow_timestamp_array_builder_append_value() instead.
  */
 gboolean
 garrow_timestamp_array_builder_append(GArrowTimestampArrayBuilder *builder,
                                       gint64 value,
                                       GError **error)
 {
-  return garrow_array_builder_append
+  return garrow_timestamp_array_builder_append_value(builder, value, error);
+}
+
+/**
+ * garrow_timestamp_array_builder_append_value:
+ * @builder: A #GArrowTimestampArrayBuilder.
+ * @value: The number of milliseconds since UNIX epoch in signed 64bit integer.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ *
+ * Since: 0.12.0
+ */
+gboolean
+garrow_timestamp_array_builder_append_value(GArrowTimestampArrayBuilder *builder,
+                                            gint64 value,
+                                            GError **error)
+{
+  return garrow_array_builder_append_value
     (GARROW_ARRAY_BUILDER(builder),
      value,
      error,
-     "[timestamp-array-builder][append]");
+     "[timestamp-array-builder][append-value]");
 }
 
 /**
@@ -2699,17 +3083,38 @@ garrow_time32_array_builder_new(GArrowTime32DataType *data_type)
  * Returns: %TRUE on success, %FALSE if there was an error.
  *
  * Since: 0.7.0
+ *
+ * Deprecated: 0.12.0:
+ *   Use garrow_time32_array_builder_append_value() instead.
  */
 gboolean
 garrow_time32_array_builder_append(GArrowTime32ArrayBuilder *builder,
                                    gint32 value,
                                    GError **error)
 {
-  return garrow_array_builder_append
+  return garrow_time32_array_builder_append_value(builder, value, error);
+}
+
+/**
+ * garrow_time32_array_builder_append_value:
+ * @builder: A #GArrowTime32ArrayBuilder.
+ * @value: The number of seconds or milliseconds since midnight in signed 32bit integer.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ *
+ * Since: 0.12.0
+ */
+gboolean
+garrow_time32_array_builder_append_value(GArrowTime32ArrayBuilder *builder,
+                                         gint32 value,
+                                         GError **error)
+{
+  return garrow_array_builder_append_value
     (GARROW_ARRAY_BUILDER(builder),
      value,
      error,
-     "[time32-array-builder][append]");
+     "[time32-array-builder][append-value]");
 }
 
 /**
@@ -2836,17 +3241,38 @@ garrow_time64_array_builder_new(GArrowTime64DataType *data_type)
  * Returns: %TRUE on success, %FALSE if there was an error.
  *
  * Since: 0.7.0
+ *
+ * Deprecated: 0.12.0:
+ *   Use garrow_time64_array_builder_append_value() instead.
  */
 gboolean
 garrow_time64_array_builder_append(GArrowTime64ArrayBuilder *builder,
                                    gint64 value,
                                    GError **error)
 {
-  return garrow_array_builder_append
+  return garrow_time64_array_builder_append_value(builder, value, error);
+}
+
+/**
+ * garrow_time64_array_builder_append_value:
+ * @builder: A #GArrowTime64ArrayBuilder.
+ * @value: The number of microseconds or nanoseconds since midnight in signed 64bit integer.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
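+ *
+ * For example, appending 01:00:00.000000 a.m. (a sketch; it assumes the
+ * GARROW_TIME_UNIT_MICRO value of #GArrowTimeUnit):
+ * |[
+ * GError *error = NULL;
+ * GArrowTime64DataType *data_type =
+ *   garrow_time64_data_type_new(GARROW_TIME_UNIT_MICRO, &error);
+ * GArrowTime64ArrayBuilder *builder =
+ *   garrow_time64_array_builder_new(data_type);
+ *
+ * // 1 hour = 3600000000 microseconds since midnight
+ * garrow_time64_array_builder_append_value(builder,
+ *                                          G_GINT64_CONSTANT(3600000000),
+ *                                          &error);
+ * g_object_unref(builder);
+ * g_object_unref(data_type);
+ * ]|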
+ * + * Since: 0.12.0 + */ +gboolean +garrow_time64_array_builder_append_value(GArrowTime64ArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[time64-array-builder][append]"); + "[time64-array-builder][append-value]"); } /** @@ -3047,17 +3473,72 @@ garrow_list_array_builder_new(GArrowListDataType *data_type, * g_object_unref(array); * } * ]| + * + * Deprecated: 0.12.0: + * Use garrow_list_array_builder_append_value() instead. */ gboolean garrow_list_array_builder_append(GArrowListArrayBuilder *builder, GError **error) +{ + return garrow_list_array_builder_append_value(builder, error); +} + +/** + * garrow_list_array_builder_append_value: + * @builder: A #GArrowListArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * It appends a new list element. To append a new list element, you + * need to call this function then append list element values to + * `value_builder`. `value_builder` is the #GArrowArrayBuilder + * specified to constructor. You can get `value_builder` by + * garrow_list_array_builder_get_value_builder(). + * + * |[ + * GArrowInt8ArrayBuilder *value_builder; + * GArrowListArrayBuilder *builder; + * + * value_builder = garrow_int8_array_builder_new(); + * builder = garrow_list_array_builder_new(value_builder, NULL); + * + * // Start 0th list element: [1, 0, -1] + * garrow_list_array_builder_append(builder, NULL); + * garrow_int8_array_builder_append(value_builder, 1); + * garrow_int8_array_builder_append(value_builder, 0); + * garrow_int8_array_builder_append(value_builder, -1); + * + * // Start 1st list element: [-29, 29] + * garrow_list_array_builder_append(builder, NULL); + * garrow_int8_array_builder_append(value_builder, -29); + * garrow_int8_array_builder_append(value_builder, 29); + * + * { + * // [[1, 0, -1], [-29, 29]] + * GArrowArray *array = garrow_array_builder_finish(builder); + * // Now, builder is needless. + * g_object_unref(builder); + * g_object_unref(value_builder); + * + * // Use array... + * g_object_unref(array); + * } + * ]| + * + * Since: 0.12.0 + */ +gboolean +garrow_list_array_builder_append_value(GArrowListArrayBuilder *builder, + GError **error) { auto arrow_builder = static_cast( garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); auto status = arrow_builder->Append(); - return garrow_error_check(error, status, "[list-array-builder][append]"); + return garrow_error_check(error, status, "[list-array-builder][append-value]"); } /** @@ -3195,17 +3676,49 @@ garrow_struct_array_builder_new(GArrowStructDataType *data_type, * |[ * // TODO * ]| + * + * Deprecated: 0.12.0: + * Use garrow_struct_array_builder_append_value() instead. */ gboolean garrow_struct_array_builder_append(GArrowStructArrayBuilder *builder, GError **error) +{ + return garrow_struct_array_builder_append_value(builder, error); +} + +/** + * garrow_struct_array_builder_append_value: + * @builder: A #GArrowStructArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * It appends a new struct element. To append a new struct element, + * you need to call this function then append struct element field + * values to all `field_builder`s. `field_value`s are the + * #GArrowArrayBuilder specified to constructor. 
You can get + * `field_builder` by garrow_struct_array_builder_get_field_builder() + * or garrow_struct_array_builder_get_field_builders(). + * + * |[ + * // TODO + * ]| + * + * Since: 0.12.0 + */ +gboolean +garrow_struct_array_builder_append_value(GArrowStructArrayBuilder *builder, + GError **error) { auto arrow_builder = static_cast( garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); auto status = arrow_builder->Append(); - return garrow_error_check(error, status, "[struct-array-builder][append]"); + return garrow_error_check(error, + status, + "[struct-array-builder][append-value]"); } /** @@ -3290,14 +3803,14 @@ garrow_decimal128_array_builder_class_init(GArrowDecimal128ArrayBuilderClass *kl /** * garrow_decimal128_array_builder_new: - * @data_type: #GArrowDecimalDataType for the decimal. + * @data_type: #GArrowDecimal128DataType for the decimal. * * Returns: A newly created #GArrowDecimal128ArrayBuilder. * * Since: 0.10.0 */ GArrowDecimal128ArrayBuilder * -garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type) +garrow_decimal128_array_builder_new(GArrowDecimal128DataType *data_type) { auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); auto builder = garrow_array_builder_new(arrow_data_type, @@ -3315,18 +3828,60 @@ garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.10.0 + * + * Deprecated: 0.12.0: + * Use garrow_decimal128_array_builder_append_value() instead. */ gboolean garrow_decimal128_array_builder_append(GArrowDecimal128ArrayBuilder *builder, GArrowDecimal128 *value, GError **error) +{ + return garrow_decimal128_array_builder_append_value(builder, value, error); +} + +/** + * garrow_decimal128_array_builder_append_value: + * @builder: A #GArrowDecimal128ArrayBuilder. + * @value: A decimal value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_array_builder_append_value(GArrowDecimal128ArrayBuilder *builder, + GArrowDecimal128 *value, + GError **error) { auto arrow_decimal = garrow_decimal128_get_raw(value); - return garrow_array_builder_append + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), *arrow_decimal, error, - "[decimal128-array-builder][append]"); + "[decimal128-array-builder][append-value]"); +} + +/** + * garrow_decimal128_array_builder_append_null: + * @builder: A #GArrowDecimal128ArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * It appends a new NULL element. 
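+ *
+ * For example (a sketch; garrow_decimal128_data_type_new() is defined in
+ * basic-data-type.cpp):
+ * |[
+ * GArrowDecimal128DataType *data_type =
+ *   garrow_decimal128_data_type_new(8, 2);
+ * GArrowDecimal128ArrayBuilder *builder =
+ *   garrow_decimal128_array_builder_new(data_type);
+ *
+ * // Appends one NULL element to the array being built.
+ * garrow_decimal128_array_builder_append_null(builder, NULL);
+ * g_object_unref(builder);
+ * g_object_unref(data_type);
+ * ]|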
+ * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_array_builder_append_null(GArrowDecimal128ArrayBuilder *builder, + GError **error) +{ + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[decimal128-array-builder][append-null]"); } G_END_DECLS diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index db340b70ab77c..bc0a99429b8f1 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -20,8 +20,7 @@ #pragma once #include -#include -#include +#include G_BEGIN_DECLS @@ -90,9 +89,16 @@ GType garrow_boolean_array_builder_get_type(void) G_GNUC_CONST; GArrowBooleanArrayBuilder *garrow_boolean_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_boolean_array_builder_append_value) gboolean garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_boolean_array_builder_append_value(GArrowBooleanArrayBuilder *builder, + gboolean value, + GError **error); gboolean garrow_boolean_array_builder_append_values(GArrowBooleanArrayBuilder *builder, const gboolean *values, gint64 values_length, @@ -150,9 +156,16 @@ GType garrow_int_array_builder_get_type(void) G_GNUC_CONST; GArrowIntArrayBuilder *garrow_int_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int_array_builder_append_value) gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int_array_builder_append_value(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_int_array_builder_append_values(GArrowIntArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -179,9 +192,16 @@ struct _GArrowUIntArrayBuilderClass GArrowUIntArrayBuilder *garrow_uint_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint_array_builder_append_value) gboolean garrow_uint_array_builder_append(GArrowUIntArrayBuilder *builder, guint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint_array_builder_append_value(GArrowUIntArrayBuilder *builder, + guint64 value, + GError **error); gboolean garrow_uint_array_builder_append_values(GArrowUIntArrayBuilder *builder, const guint64 *values, gint64 values_length, @@ -239,9 +259,16 @@ GType garrow_int8_array_builder_get_type(void) G_GNUC_CONST; GArrowInt8ArrayBuilder *garrow_int8_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int8_array_builder_append_value) gboolean garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int8_array_builder_append_value(GArrowInt8ArrayBuilder *builder, + gint8 value, + GError **error); gboolean garrow_int8_array_builder_append_values(GArrowInt8ArrayBuilder *builder, const gint8 *values, gint64 values_length, @@ -299,9 +326,16 @@ GType garrow_uint8_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt8ArrayBuilder *garrow_uint8_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint8_array_builder_append_value) gboolean garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean 
garrow_uint8_array_builder_append_value(GArrowUInt8ArrayBuilder *builder, + guint8 value, + GError **error); gboolean garrow_uint8_array_builder_append_values(GArrowUInt8ArrayBuilder *builder, const guint8 *values, gint64 values_length, @@ -359,9 +393,16 @@ GType garrow_int16_array_builder_get_type(void) G_GNUC_CONST; GArrowInt16ArrayBuilder *garrow_int16_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int16_array_builder_append_value) gboolean garrow_int16_array_builder_append(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int16_array_builder_append_value(GArrowInt16ArrayBuilder *builder, + gint16 value, + GError **error); gboolean garrow_int16_array_builder_append_values(GArrowInt16ArrayBuilder *builder, const gint16 *values, gint64 values_length, @@ -419,9 +460,16 @@ GType garrow_uint16_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt16ArrayBuilder *garrow_uint16_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint16_array_builder_append_value) gboolean garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, guint16 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint16_array_builder_append_value(GArrowUInt16ArrayBuilder *builder, + guint16 value, + GError **error); gboolean garrow_uint16_array_builder_append_values(GArrowUInt16ArrayBuilder *builder, const guint16 *values, gint64 values_length, @@ -479,9 +527,16 @@ GType garrow_int32_array_builder_get_type(void) G_GNUC_CONST; GArrowInt32ArrayBuilder *garrow_int32_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int32_array_builder_append_value) gboolean garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int32_array_builder_append_value(GArrowInt32ArrayBuilder *builder, + gint32 value, + GError **error); gboolean garrow_int32_array_builder_append_values(GArrowInt32ArrayBuilder *builder, const gint32 *values, gint64 values_length, @@ -539,9 +594,16 @@ GType garrow_uint32_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt32ArrayBuilder *garrow_uint32_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint32_array_builder_append_value) gboolean garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint32_array_builder_append_value(GArrowUInt32ArrayBuilder *builder, + guint32 value, + GError **error); gboolean garrow_uint32_array_builder_append_values(GArrowUInt32ArrayBuilder *builder, const guint32 *values, gint64 values_length, @@ -599,9 +661,16 @@ GType garrow_int64_array_builder_get_type(void) G_GNUC_CONST; GArrowInt64ArrayBuilder *garrow_int64_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int64_array_builder_append_value) gboolean garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int64_array_builder_append_value(GArrowInt64ArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_int64_array_builder_append_values(GArrowInt64ArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -659,9 +728,16 @@ GType garrow_uint64_array_builder_get_type(void) 
G_GNUC_CONST; GArrowUInt64ArrayBuilder *garrow_uint64_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint64_array_builder_append_value) gboolean garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, guint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint64_array_builder_append_value(GArrowUInt64ArrayBuilder *builder, + guint64 value, + GError **error); gboolean garrow_uint64_array_builder_append_values(GArrowUInt64ArrayBuilder *builder, const guint64 *values, gint64 values_length, @@ -719,9 +795,16 @@ GType garrow_float_array_builder_get_type(void) G_GNUC_CONST; GArrowFloatArrayBuilder *garrow_float_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_float_array_builder_append_value) gboolean garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, gfloat value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_float_array_builder_append_value(GArrowFloatArrayBuilder *builder, + gfloat value, + GError **error); gboolean garrow_float_array_builder_append_values(GArrowFloatArrayBuilder *builder, const gfloat *values, gint64 values_length, @@ -779,9 +862,16 @@ GType garrow_double_array_builder_get_type(void) G_GNUC_CONST; GArrowDoubleArrayBuilder *garrow_double_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_double_array_builder_append_value) gboolean garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_double_array_builder_append_value(GArrowDoubleArrayBuilder *builder, + gdouble value, + GError **error); gboolean garrow_double_array_builder_append_values(GArrowDoubleArrayBuilder *builder, const gdouble *values, gint64 values_length, @@ -839,10 +929,18 @@ GType garrow_binary_array_builder_get_type(void) G_GNUC_CONST; GArrowBinaryArrayBuilder *garrow_binary_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_binary_array_builder_append_value) gboolean garrow_binary_array_builder_append(GArrowBinaryArrayBuilder *builder, const guint8 *value, gint32 length, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_binary_array_builder_append_value(GArrowBinaryArrayBuilder *builder, + const guint8 *value, + gint32 length, + GError **error); gboolean garrow_binary_array_builder_append_null(GArrowBinaryArrayBuilder *builder, GError **error); @@ -891,9 +989,16 @@ GType garrow_string_array_builder_get_type(void) G_GNUC_CONST; GArrowStringArrayBuilder *garrow_string_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_string_array_builder_append_value) gboolean garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, const gchar *value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_string_array_builder_append_value(GArrowStringArrayBuilder *builder, + const gchar *value, + GError **error); gboolean garrow_string_array_builder_append_values(GArrowStringArrayBuilder *builder, const gchar **values, gint64 values_length, @@ -946,9 +1051,16 @@ GType garrow_date32_array_builder_get_type(void) G_GNUC_CONST; GArrowDate32ArrayBuilder *garrow_date32_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_date32_array_builder_append_value) gboolean garrow_date32_array_builder_append(GArrowDate32ArrayBuilder *builder, 
gint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_date32_array_builder_append_value(GArrowDate32ArrayBuilder *builder, + gint32 value, + GError **error); gboolean garrow_date32_array_builder_append_values(GArrowDate32ArrayBuilder *builder, const gint32 *values, gint64 values_length, @@ -1006,9 +1118,16 @@ GType garrow_date64_array_builder_get_type(void) G_GNUC_CONST; GArrowDate64ArrayBuilder *garrow_date64_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_date64_array_builder_append_value) gboolean garrow_date64_array_builder_append(GArrowDate64ArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_date64_array_builder_append_value(GArrowDate64ArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_date64_array_builder_append_values(GArrowDate64ArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -1067,9 +1186,16 @@ GType garrow_timestamp_array_builder_get_type(void) G_GNUC_CONST; GArrowTimestampArrayBuilder * garrow_timestamp_array_builder_new(GArrowTimestampDataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_timestamp_array_builder_append_value) gboolean garrow_timestamp_array_builder_append(GArrowTimestampArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_timestamp_array_builder_append_value(GArrowTimestampArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_timestamp_array_builder_append_values(GArrowTimestampArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -1127,9 +1253,16 @@ GType garrow_time32_array_builder_get_type(void) G_GNUC_CONST; GArrowTime32ArrayBuilder *garrow_time32_array_builder_new(GArrowTime32DataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_time32_array_builder_append_value) gboolean garrow_time32_array_builder_append(GArrowTime32ArrayBuilder *builder, gint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_time32_array_builder_append_value(GArrowTime32ArrayBuilder *builder, + gint32 value, + GError **error); gboolean garrow_time32_array_builder_append_values(GArrowTime32ArrayBuilder *builder, const gint32 *values, gint64 values_length, @@ -1187,9 +1320,16 @@ GType garrow_time64_array_builder_get_type(void) G_GNUC_CONST; GArrowTime64ArrayBuilder *garrow_time64_array_builder_new(GArrowTime64DataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_time64_array_builder_append_value) gboolean garrow_time64_array_builder_append(GArrowTime64ArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_time64_array_builder_append_value(GArrowTime64ArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_time64_array_builder_append_values(GArrowTime64ArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -1248,8 +1388,14 @@ GType garrow_list_array_builder_get_type(void) G_GNUC_CONST; GArrowListArrayBuilder *garrow_list_array_builder_new(GArrowListDataType *data_type, GError **error); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_list_array_builder_append_value) gboolean garrow_list_array_builder_append(GArrowListArrayBuilder *builder, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_list_array_builder_append_value(GArrowListArrayBuilder *builder, + 
GError **error); gboolean garrow_list_array_builder_append_null(GArrowListArrayBuilder *builder, GError **error); @@ -1301,8 +1447,14 @@ GType garrow_struct_array_builder_get_type(void) G_GNUC_CONST; GArrowStructArrayBuilder *garrow_struct_array_builder_new(GArrowStructDataType *data_type, GError **error); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_struct_array_builder_append_value) gboolean garrow_struct_array_builder_append(GArrowStructArrayBuilder *builder, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_struct_array_builder_append_value(GArrowStructArrayBuilder *builder, + GError **error); gboolean garrow_struct_array_builder_append_null(GArrowStructArrayBuilder *builder, GError **error); @@ -1322,10 +1474,20 @@ struct _GArrowDecimal128ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; -GArrowDecimal128ArrayBuilder *garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type); +GArrowDecimal128ArrayBuilder *garrow_decimal128_array_builder_new(GArrowDecimal128DataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_array_builder_append_value) gboolean garrow_decimal128_array_builder_append(GArrowDecimal128ArrayBuilder *builder, GArrowDecimal128 *value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_array_builder_append_value(GArrowDecimal128ArrayBuilder *builder, + GArrowDecimal128 *value, + GError **error); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_array_builder_append_null(GArrowDecimal128ArrayBuilder *builder, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index fef43a0285e25..9aebd9cb8957a 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -22,12 +22,12 @@ #endif #include +#include #include #include -#include +#include #include #include -#include #include diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index cd3aa97679b5d..b6c5705fb070b 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -66,6 +66,8 @@ G_BEGIN_DECLS * * #GArrowBinaryDataType is a class for binary data type. * + * #GArrowFixedSizeBinaryDataType is a class for fixed-size binary data type. + * * #GArrowStringDataType is a class for UTF-8 encoded string data * type. * @@ -85,7 +87,9 @@ G_BEGIN_DECLS * #GArrowTime64DataType is a class for the number of microseconds or * nanoseconds since midnight in 64-bit signed integer data type. * - * #GArrowDecimalDataType is a class for 128-bit decimal data type. + * #GArrowDecimalDataType is a base class for decimal data type. + * + * #GArrowDecimal128DataType is a class for 128-bit decimal data type. */ typedef struct GArrowDataTypePrivate_ { @@ -237,7 +241,7 @@ garrow_fixed_width_data_type_class_init(GArrowFixedWidthDataTypeClass *klass) } /** - * garrow_fixed_width_data_type_get_id: + * garrow_fixed_width_data_type_get_bit_width: * @data_type: A #GArrowFixedWidthDataType. * * Returns: The number of bits for one data. 
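 *
 * For example (a sketch; it assumes #GArrowInt32DataType is a
 * fixed-width data type):
 * |[
 * GArrowInt32DataType *data_type = garrow_int32_data_type_new();
 * gint bit_width =
 *   garrow_fixed_width_data_type_get_bit_width(GARROW_FIXED_WIDTH_DATA_TYPE(data_type));
 * // bit_width is 32
 * g_object_unref(data_type);
 * ]|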
@@ -714,6 +718,59 @@ garrow_binary_data_type_new(void)
 }
 
 
+G_DEFINE_TYPE(GArrowFixedSizeBinaryDataType,
+              garrow_fixed_size_binary_data_type,
+              GARROW_TYPE_FIXED_WIDTH_DATA_TYPE)
+
+static void
+garrow_fixed_size_binary_data_type_init(GArrowFixedSizeBinaryDataType *object)
+{
+}
+
+static void
+garrow_fixed_size_binary_data_type_class_init(GArrowFixedSizeBinaryDataTypeClass *klass)
+{
+}
+
+/**
+ * garrow_fixed_size_binary_data_type_new:
+ * @byte_width: The byte width.
+ *
+ * Returns: The newly created fixed-size binary data type.
+ *
+ * Since: 0.12.0
+ */
+GArrowFixedSizeBinaryDataType *
+garrow_fixed_size_binary_data_type_new(gint32 byte_width)
+{
+  auto arrow_fixed_size_binary_data_type = arrow::fixed_size_binary(byte_width);
+
+  auto fixed_size_binary_data_type =
+    GARROW_FIXED_SIZE_BINARY_DATA_TYPE(g_object_new(GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE,
+                                                    "data-type", &arrow_fixed_size_binary_data_type,
+                                                    NULL));
+  return fixed_size_binary_data_type;
+}
+
+/**
+ * garrow_fixed_size_binary_data_type_get_byte_width:
+ * @data_type: A #GArrowFixedSizeBinaryDataType.
+ *
+ * Returns: The number of bytes for one data.
+ *
+ * Since: 0.12.0
+ */
+gint32
+garrow_fixed_size_binary_data_type_get_byte_width(GArrowFixedSizeBinaryDataType *data_type)
+{
+  const auto arrow_data_type =
+    garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
+  const auto arrow_fixed_size_binary_type =
+    std::static_pointer_cast<arrow::FixedSizeBinaryType>(arrow_data_type);
+  return arrow_fixed_size_binary_type->byte_width();
+}
+
+
 G_DEFINE_TYPE(GArrowStringDataType,
               garrow_string_data_type,
               GARROW_TYPE_DATA_TYPE)
@@ -1040,9 +1097,9 @@ garrow_time64_data_type_new(GArrowTimeUnit unit, GError **error)
 }
 
 
-G_DEFINE_TYPE(GArrowDecimalDataType,
-              garrow_decimal_data_type,
-              GARROW_TYPE_DATA_TYPE)
+G_DEFINE_ABSTRACT_TYPE(GArrowDecimalDataType,
+                       garrow_decimal_data_type,
+                       GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE)
 
 static void
 garrow_decimal_data_type_init(GArrowDecimalDataType *object)
@@ -1062,18 +1119,16 @@ garrow_decimal_data_type_class_init(GArrowDecimalDataTypeClass *klass)
  * Returns: The newly created decimal data type.
  *
  * Since: 0.10.0
+ *
+ * Deprecated: 0.12.0:
+ *   Use garrow_decimal128_data_type_new() instead.
  */
 GArrowDecimalDataType *
 garrow_decimal_data_type_new(gint32 precision,
                              gint32 scale)
 {
-  auto arrow_data_type = arrow::decimal(precision, scale);
-
-  GArrowDecimalDataType *data_type =
-    GARROW_DECIMAL_DATA_TYPE(g_object_new(GARROW_TYPE_DECIMAL_DATA_TYPE,
-                                          "data-type", &arrow_data_type,
-                                          NULL));
-  return data_type;
+  auto decimal128_data_type = garrow_decimal128_data_type_new(precision, scale);
+  return GARROW_DECIMAL_DATA_TYPE(decimal128_data_type);
 }
 
 /**
@@ -1112,6 +1167,43 @@ garrow_decimal_data_type_get_scale(GArrowDecimalDataType *decimal_data_type)
   return arrow_decimal_type->scale();
 }
 
+
+G_DEFINE_TYPE(GArrowDecimal128DataType,
+              garrow_decimal128_data_type,
+              GARROW_TYPE_DECIMAL_DATA_TYPE)
+
+static void
+garrow_decimal128_data_type_init(GArrowDecimal128DataType *object)
+{
+}
+
+static void
+garrow_decimal128_data_type_class_init(GArrowDecimal128DataTypeClass *klass)
+{
+}
+
+/**
+ * garrow_decimal128_data_type_new:
+ * @precision: The precision of decimal data.
+ * @scale: The scale of decimal data.
+ *
+ * Returns: The newly created 128-bit decimal data type.
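+ *
+ * For example, garrow_decimal128_data_type_new(8, 2) describes decimal
+ * values with up to 8 significant digits, 2 of which follow the decimal
+ * point (such as 123456.78).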
+ * + * Since: 0.12.0 + */ +GArrowDecimal128DataType * +garrow_decimal128_data_type_new(gint32 precision, + gint32 scale) +{ + auto arrow_data_type = arrow::decimal(precision, scale); + + auto data_type = + GARROW_DECIMAL128_DATA_TYPE(g_object_new(GARROW_TYPE_DECIMAL128_DATA_TYPE, + "data-type", &arrow_data_type, + NULL)); + return data_type; +} + G_END_DECLS GArrowDataType * @@ -1160,6 +1252,9 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::BINARY: type = GARROW_TYPE_BINARY_DATA_TYPE; break; + case arrow::Type::type::FIXED_SIZE_BINARY: + type = GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE; + break; case arrow::Type::type::STRING: type = GARROW_TYPE_STRING_DATA_TYPE; break; @@ -1199,7 +1294,7 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) type = GARROW_TYPE_DICTIONARY_DATA_TYPE; break; case arrow::Type::type::DECIMAL: - type = GARROW_TYPE_DECIMAL_DATA_TYPE; + type = GARROW_TYPE_DECIMAL128_DATA_TYPE; break; default: type = GARROW_TYPE_DATA_TYPE; diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index 45fddba34d4bc..d18958265748d 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -19,9 +19,9 @@ #pragma once -#include +#include #include -#include +#include G_BEGIN_DECLS @@ -338,6 +338,25 @@ GType garrow_binary_data_type_get_type (void) G_GNUC_CONST; GArrowBinaryDataType *garrow_binary_data_type_new (void); +#define GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE (garrow_fixed_size_binary_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryDataType, + garrow_fixed_size_binary_data_type, + GARROW, + FIXED_SIZE_BINARY_DATA_TYPE, + GArrowDataType) +struct _GArrowFixedSizeBinaryDataTypeClass +{ + GArrowFixedWidthDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_0_12 +GArrowFixedSizeBinaryDataType * +garrow_fixed_size_binary_data_type_new(gint32 byte_width); +GARROW_AVAILABLE_IN_0_12 +gint32 +garrow_fixed_size_binary_data_type_get_byte_width(GArrowFixedSizeBinaryDataType *data_type); + + #define GARROW_TYPE_STRING_DATA_TYPE \ (garrow_string_data_type_get_type()) #define GARROW_STRING_DATA_TYPE(obj) \ @@ -655,15 +674,34 @@ G_DECLARE_DERIVABLE_TYPE(GArrowDecimalDataType, garrow_decimal_data_type, GARROW, DECIMAL_DATA_TYPE, - GArrowDataType) + GArrowFixedSizeBinaryDataType) struct _GArrowDecimalDataTypeClass { - GArrowDataTypeClass parent_class; + GArrowFixedSizeBinaryDataTypeClass parent_class; }; -GArrowDecimalDataType *garrow_decimal_data_type_new (gint32 precision, - gint32 scale); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_data_type_new) +GArrowDecimalDataType * +garrow_decimal_data_type_new(gint32 precision, gint32 scale); +#endif gint32 garrow_decimal_data_type_get_precision(GArrowDecimalDataType *decimal_data_type); gint32 garrow_decimal_data_type_get_scale(GArrowDecimalDataType *decimal_data_type); + +#define GARROW_TYPE_DECIMAL128_DATA_TYPE (garrow_decimal128_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128DataType, + garrow_decimal128_data_type, + GARROW, + DECIMAL128_DATA_TYPE, + GArrowDecimalDataType) +struct _GArrowDecimal128DataTypeClass +{ + GArrowDecimalDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_0_12 +GArrowDecimal128DataType * +garrow_decimal128_data_type_new(gint32 precision, gint32 scale); + G_END_DECLS diff --git a/c_glib/arrow-glib/codec.cpp b/c_glib/arrow-glib/codec.cpp index 45863878e9c7e..7f06fabde74e8 100644 --- a/c_glib/arrow-glib/codec.cpp +++ 
b/c_glib/arrow-glib/codec.cpp @@ -119,7 +119,7 @@ garrow_codec_class_init(GArrowCodecClass *klass) /** * garrow_codec_new: - * @type: A #GArrowCodompressionType. + * @type: A #GArrowCompressionType. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: A newly created #GArrowCodec on success, %NULL on error. diff --git a/c_glib/arrow-glib/column.cpp b/c_glib/arrow-glib/column.cpp index e3e964f557659..68694b3d67903 100644 --- a/c_glib/arrow-glib/column.cpp +++ b/c_glib/arrow-glib/column.cpp @@ -322,7 +322,10 @@ garrow_column_get_field(GArrowColumn *column) } else { const auto arrow_column = garrow_column_get_raw(column); auto arrow_field = arrow_column->field(); - return garrow_field_new_raw(&arrow_field); + auto data_type = garrow_column_get_data_type(column); + auto field = garrow_field_new_raw(&arrow_field, data_type); + g_object_unref(data_type); + return field; } } diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h index c634dbfc3b006..10432e2e56ba3 100644 --- a/c_glib/arrow-glib/composite-array.h +++ b/c_glib/arrow-glib/composite-array.h @@ -130,8 +130,10 @@ GArrowStructArray *garrow_struct_array_new(GArrowDataType *data_type, GArrowArray *garrow_struct_array_get_field(GArrowStructArray *array, gint i); +#ifndef GARROW_DISABLE_DEPRECATED GARROW_DEPRECATED_IN_0_10_FOR(garrow_struct_array_flatten) GList *garrow_struct_array_get_fields(GArrowStructArray *array); +#endif GARROW_AVAILABLE_IN_0_10 GList *garrow_struct_array_flatten(GArrowStructArray *array, GError **error); diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index a4d3d843617a0..675900a5becc2 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -88,19 +88,34 @@ garrow_list_data_type_new(GArrowField *field) * @list_data_type: A #GArrowListDataType. * * Returns: (transfer full): The field of value. + * + * Deprecated: 0.13.0: + * Use garrow_list_data_type_get_field() instead. */ GArrowField * garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type) { - auto arrow_data_type = - garrow_data_type_get_raw(GARROW_DATA_TYPE(list_data_type)); + return garrow_list_data_type_get_field(list_data_type); +} + +/** + * garrow_list_data_type_get_field: + * @list_data_type: A #GArrowListDataType. + * + * Returns: (transfer full): The field of value. + * + * Since: 0.13.0 + */ +GArrowField * +garrow_list_data_type_get_field(GArrowListDataType *list_data_type) +{ + auto data_type = GARROW_DATA_TYPE(list_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_list_data_type = static_cast(arrow_data_type.get()); auto arrow_field = arrow_list_data_type->value_field(); - auto field = garrow_field_new_raw(&arrow_field); - - return field; + return garrow_field_new_raw(&arrow_field, nullptr); } @@ -143,22 +158,22 @@ garrow_struct_data_type_new(GList *fields) /** * garrow_struct_data_type_get_n_fields: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * * Returns: The number of fields of the struct data type. 
* * Since: 0.12.0 */ gint -garrow_struct_data_type_get_n_fields(GArrowStructDataType *data_type) +garrow_struct_data_type_get_n_fields(GArrowStructDataType *struct_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(struct_data_type)); return arrow_data_type->num_children(); } /** * garrow_struct_data_type_get_fields: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * * Returns: (transfer full) (element-type GArrowField): * The fields of the struct data type. @@ -166,21 +181,22 @@ garrow_struct_data_type_get_n_fields(GArrowStructDataType *data_type) * Since: 0.12.0 */ GList * -garrow_struct_data_type_get_fields(GArrowStructDataType *data_type) +garrow_struct_data_type_get_fields(GArrowStructDataType *struct_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(struct_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_fields = arrow_data_type->children(); GList *fields = NULL; for (auto arrow_field : arrow_fields) { - fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field)); + fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field, nullptr)); } return g_list_reverse(fields); } /** * garrow_struct_data_type_get_field: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * @i: The index of the target field. * * Returns: (transfer full) (nullable): @@ -189,10 +205,11 @@ garrow_struct_data_type_get_fields(GArrowStructDataType *data_type) * Since: 0.12.0 */ GArrowField * -garrow_struct_data_type_get_field(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field(GArrowStructDataType *struct_data_type, gint i) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(struct_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); if (i < 0) { i += arrow_data_type->num_children(); @@ -206,7 +223,7 @@ garrow_struct_data_type_get_field(GArrowStructDataType *data_type, auto arrow_field = arrow_data_type->child(i); if (arrow_field) { - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, nullptr); } else { return NULL; } @@ -214,7 +231,7 @@ garrow_struct_data_type_get_field(GArrowStructDataType *data_type, /** * garrow_struct_data_type_get_field_by_name: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * @name: The name of the target field. 
* * Returns: (transfer full) (nullable): @@ -223,16 +240,17 @@ garrow_struct_data_type_get_field(GArrowStructDataType *data_type, * Since: 0.12.0 */ GArrowField * -garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_by_name(GArrowStructDataType *struct_data_type, const gchar *name) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(struct_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); - auto arrow_field = arrow_struct_data_type->GetChildByName(name); + auto arrow_field = arrow_struct_data_type->GetFieldByName(name); if (arrow_field) { - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, nullptr); } else { return NULL; } @@ -240,7 +258,7 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, /** * garrow_struct_data_type_get_field_index: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * @name: The name of the target field. * * Returns: The index of the target index in the struct data type @@ -249,14 +267,14 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, * Since: 0.12.0 */ gint -garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_index(GArrowStructDataType *struct_data_type, const gchar *name) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(struct_data_type)); auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); - return arrow_struct_data_type->GetChildIndex(name); + return arrow_struct_data_type->GetFieldIndex(name); } @@ -276,22 +294,22 @@ garrow_union_data_type_class_init(GArrowUnionDataTypeClass *klass) /** * garrow_union_data_type_get_n_fields: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * * Returns: The number of fields of the union data type. * * Since: 0.12.0 */ gint -garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type) +garrow_union_data_type_get_n_fields(GArrowUnionDataType *union_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(union_data_type)); return arrow_data_type->num_children(); } /** * garrow_union_data_type_get_fields: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * * Returns: (transfer full) (element-type GArrowField): * The fields of the union data type. 
@@ -299,21 +317,22 @@ garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type) * Since: 0.12.0 */ GList * -garrow_union_data_type_get_fields(GArrowUnionDataType *data_type) +garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(union_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_fields = arrow_data_type->children(); GList *fields = NULL; for (auto arrow_field : arrow_fields) { - fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field)); + fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field, nullptr)); } return g_list_reverse(fields); } /** * garrow_union_data_type_get_field: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * @i: The index of the target field. * * Returns: (transfer full) (nullable): @@ -322,10 +341,11 @@ garrow_union_data_type_get_fields(GArrowUnionDataType *data_type) * Since: 0.12.0 */ GArrowField * -garrow_union_data_type_get_field(GArrowUnionDataType *data_type, - gint i) +garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, + gint i) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(union_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); if (i < 0) { i += arrow_data_type->num_children(); @@ -339,7 +359,7 @@ garrow_union_data_type_get_field(GArrowUnionDataType *data_type, auto arrow_field = arrow_data_type->child(i); if (arrow_field) { - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, nullptr); } else { return NULL; } @@ -347,7 +367,7 @@ garrow_union_data_type_get_field(GArrowUnionDataType *data_type, /** * garrow_union_data_type_get_type_codes: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * @n_type_codes: (out): The number of type codes. * * Returns: (transfer full) (array length=n_type_codes): @@ -358,10 +378,10 @@ garrow_union_data_type_get_field(GArrowUnionDataType *data_type, * Since: 0.12.0 */ guint8 * -garrow_union_data_type_get_type_codes(GArrowUnionDataType *data_type, +garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(union_data_type)); auto arrow_union_data_type = std::static_pointer_cast(arrow_data_type); @@ -515,16 +535,16 @@ garrow_dictionary_data_type_new(GArrowDataType *index_data_type, /** * garrow_dictionary_data_type_get_index_data_type: - * @data_type: The #GArrowDictionaryDataType. + * @dictionary_data_type: The #GArrowDictionaryDataType. * * Returns: (transfer full): The #GArrowDataType of index. 
* * Since: 0.8.0 */ GArrowDataType * -garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_type) +garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *dictionary_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(dictionary_data_type)); auto arrow_dictionary_data_type = std::static_pointer_cast(arrow_data_type); auto arrow_index_data_type = arrow_dictionary_data_type->index_type(); @@ -533,16 +553,16 @@ garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_t /** * garrow_dictionary_data_type_get_dictionary: - * @data_type: The #GArrowDictionaryDataType. + * @dictionary_data_type: The #GArrowDictionaryDataType. * * Returns: (transfer full): The dictionary as #GArrowArray. * * Since: 0.8.0 */ GArrowArray * -garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type) +garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *dictionary_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(dictionary_data_type)); auto arrow_dictionary_data_type = std::static_pointer_cast(arrow_data_type); auto arrow_dictionary = arrow_dictionary_data_type->dictionary(); @@ -551,16 +571,16 @@ garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type) /** * garrow_dictionary_data_type_is_ordered: - * @data_type: The #GArrowDictionaryDataType. + * @dictionary_data_type: The #GArrowDictionaryDataType. * * Returns: Whether dictionary contents are ordered or not. * * Since: 0.8.0 */ gboolean -garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *data_type) +garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *dictionary_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(dictionary_data_type)); auto arrow_dictionary_data_type = std::static_pointer_cast(arrow_data_type); return arrow_dictionary_data_type->ordered(); diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index 25e1ac3d94929..beb312597d52b 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -22,6 +22,7 @@ #include #include #include +#include G_BEGIN_DECLS @@ -67,7 +68,12 @@ struct _GArrowListDataTypeClass GType garrow_list_data_type_get_type (void) G_GNUC_CONST; GArrowListDataType *garrow_list_data_type_new (GArrowField *field); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_13_FOR(garrow_list_data_type_get_field) GArrowField *garrow_list_data_type_get_value_field (GArrowListDataType *list_data_type); +#endif +GARROW_AVAILABLE_IN_0_13 +GArrowField *garrow_list_data_type_get_field (GArrowListDataType *list_data_type); #define GARROW_TYPE_STRUCT_DATA_TYPE (garrow_struct_data_type_get_type()) @@ -83,17 +89,17 @@ struct _GArrowStructDataTypeClass GArrowStructDataType *garrow_struct_data_type_new (GList *fields); gint -garrow_struct_data_type_get_n_fields(GArrowStructDataType *data_type); +garrow_struct_data_type_get_n_fields(GArrowStructDataType *struct_data_type); GList * -garrow_struct_data_type_get_fields(GArrowStructDataType *data_type); +garrow_struct_data_type_get_fields(GArrowStructDataType *struct_data_type); GArrowField * -garrow_struct_data_type_get_field(GArrowStructDataType 
*data_type, +garrow_struct_data_type_get_field(GArrowStructDataType *struct_data_type, gint i); GArrowField * -garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_by_name(GArrowStructDataType *struct_data_type, const gchar *name); gint -garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_index(GArrowStructDataType *struct_data_type, const gchar *name); @@ -109,14 +115,14 @@ struct _GArrowUnionDataTypeClass }; gint -garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type); +garrow_union_data_type_get_n_fields(GArrowUnionDataType *union_data_type); GList * -garrow_union_data_type_get_fields(GArrowUnionDataType *data_type); +garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type); GArrowField * -garrow_union_data_type_get_field(GArrowUnionDataType *data_type, +garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, gint i); guint8 * -garrow_union_data_type_get_type_codes(GArrowUnionDataType *data_type, +garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes); @@ -172,11 +178,11 @@ garrow_dictionary_data_type_new(GArrowDataType *index_data_type, GArrowArray *dictionary, gboolean ordered); GArrowDataType * -garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_type); +garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *dictionary_data_type); GArrowArray * -garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type); +garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *dictionary_data_type); gboolean -garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *data_type); +garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *dictionary_data_type); G_END_DECLS diff --git a/c_glib/arrow-glib/decimal.cpp b/c_glib/arrow-glib/decimal128.cpp similarity index 72% rename from c_glib/arrow-glib/decimal.cpp rename to c_glib/arrow-glib/decimal128.cpp index 34eb417a96105..32bdf5fcae6e4 100644 --- a/c_glib/arrow-glib/decimal.cpp +++ b/c_glib/arrow-glib/decimal128.cpp @@ -21,14 +21,14 @@ # include #endif -#include +#include #include G_BEGIN_DECLS /** - * SECTION: decimal - * @title: Decimal classes + * SECTION: decimal128 + * @title: 128-bit decimal class * @include: arrow-glib/arrow-glib.h * * #GArrowDecimal128 is a 128-bit decimal class. @@ -136,6 +136,120 @@ garrow_decimal128_new_integer(const gint64 data) return garrow_decimal128_new_raw(&arrow_decimal); } +/** + * garrow_decimal128_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is equal to the other decimal, %FALSE + * otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal == *arrow_other_decimal; +} + +/** + * garrow_decimal128_not_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal isn't equal to the other decimal, + * %FALSE otherwise. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_not_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal != *arrow_other_decimal; +} + +/** + * garrow_decimal128_less_than: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is less than the other decimal, + * %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_less_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal < *arrow_other_decimal; +} + +/** + * garrow_decimal128_less_than_or_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is less than the other decimal + * or equal to the other decimal, %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_less_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal <= *arrow_other_decimal; +} + +/** + * garrow_decimal128_greater_than: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is greater than the other decimal, + * %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_greater_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal > *arrow_other_decimal; +} + +/** + * garrow_decimal128_greater_than_or_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is greater than the other decimal + * or equal to the other decimal, %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_greater_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal >= *arrow_other_decimal; +} + /** * garrow_decimal128_to_string_scale: * @decimal: A #GArrowDecimal128. 
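A quick sketch of the new comparison API (not part of the patch; it assumes the existing GArrowDecimal128 constructors shown in the header below):

#include <arrow-glib/arrow-glib.h>

int
main(void)
{
  GArrowDecimal128 *a = garrow_decimal128_new_integer(123);
  GArrowDecimal128 *b = garrow_decimal128_new_string("124");

  /* All six predicates dereference the underlying arrow::Decimal128
   * values, so they compare by value, not by object identity. */
  if (garrow_decimal128_less_than(a, b)) {
    g_print("a < b\n");
  }
  if (garrow_decimal128_not_equal(a, b)) {
    g_print("a != b\n");
  }

  g_object_unref(b);
  g_object_unref(a);
  return 0;
}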
diff --git a/c_glib/arrow-glib/decimal.h b/c_glib/arrow-glib/decimal128.h similarity index 69% rename from c_glib/arrow-glib/decimal.h rename to c_glib/arrow-glib/decimal128.h index 918cf3d49b4d2..e7601a457601b 100644 --- a/c_glib/arrow-glib/decimal.h +++ b/c_glib/arrow-glib/decimal128.h @@ -20,6 +20,7 @@ #pragma once #include +#include G_BEGIN_DECLS @@ -37,6 +38,24 @@ struct _GArrowDecimal128Class GArrowDecimal128 *garrow_decimal128_new_string(const gchar *data); GArrowDecimal128 *garrow_decimal128_new_integer(const gint64 data); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_not_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_less_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_less_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_greater_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_greater_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); gchar *garrow_decimal128_to_string_scale(GArrowDecimal128 *decimal, gint32 scale); gchar *garrow_decimal128_to_string(GArrowDecimal128 *decimal); diff --git a/c_glib/arrow-glib/decimal.hpp b/c_glib/arrow-glib/decimal128.hpp similarity index 96% rename from c_glib/arrow-glib/decimal.hpp rename to c_glib/arrow-glib/decimal128.hpp index ce56cfe0bd062..84bf47e409f50 100644 --- a/c_glib/arrow-glib/decimal.hpp +++ b/c_glib/arrow-glib/decimal128.hpp @@ -23,7 +23,7 @@ #include -#include +#include GArrowDecimal128 *garrow_decimal128_new_raw(std::shared_ptr *arrow_decimal128); std::shared_ptr garrow_decimal128_get_raw(GArrowDecimal128 *decimal); diff --git a/c_glib/arrow-glib/field.cpp b/c_glib/arrow-glib/field.cpp index b989d288ec30f..f7250bc6ee634 100644 --- a/c_glib/arrow-glib/field.cpp +++ b/c_glib/arrow-glib/field.cpp @@ -37,11 +37,12 @@ G_BEGIN_DECLS typedef struct GArrowFieldPrivate_ { std::shared_ptr field; + GArrowDataType *data_type; } GArrowFieldPrivate; enum { - PROP_0, - PROP_FIELD + PROP_FIELD = 1, + PROP_DATA_TYPE }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowField, @@ -54,11 +55,22 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowField, GARROW_FIELD(obj))) static void -garrow_field_finalize(GObject *object) +garrow_field_dispose(GObject *object) { - GArrowFieldPrivate *priv; + auto priv = GARROW_FIELD_GET_PRIVATE(object); - priv = GARROW_FIELD_GET_PRIVATE(object); + if (priv->data_type) { + g_object_unref(priv->data_type); + priv->data_type = nullptr; + } + + G_OBJECT_CLASS(garrow_field_parent_class)->dispose(object); +} + +static void +garrow_field_finalize(GObject *object) +{ + auto priv = GARROW_FIELD_GET_PRIVATE(object); priv->field = nullptr; @@ -80,19 +92,9 @@ garrow_field_set_property(GObject *object, priv->field = *static_cast *>(g_value_get_pointer(value)); break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + case PROP_DATA_TYPE: + priv->data_type = GARROW_DATA_TYPE(g_value_dup_object(value)); break; - } -} - -static void -garrow_field_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - switch (prop_id) { default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -107,21 +109,27 @@ garrow_field_init(GArrowField *object) 
static void garrow_field_class_init(GArrowFieldClass *klass) { - GObjectClass *gobject_class; - GParamSpec *spec; - - gobject_class = G_OBJECT_CLASS(klass); + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_field_dispose; gobject_class->finalize = garrow_field_finalize; gobject_class->set_property = garrow_field_set_property; - gobject_class->get_property = garrow_field_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("field", "Field", "The raw std::shared<arrow::Field> *", static_cast<GParamFlags>(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_FIELD, spec); + + spec = g_param_spec_object("data-type", + "Data type", + "The data type", + GARROW_TYPE_DATA_TYPE, + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATA_TYPE, spec); } /** @@ -137,7 +145,7 @@ garrow_field_new(const gchar *name, { auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_field = std::make_shared<arrow::Field>(name, arrow_data_type); - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } /** @@ -157,7 +165,7 @@ garrow_field_new_full(const gchar *name, std::make_shared<arrow::Field>(name, garrow_data_type_get_raw(data_type), nullable); - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } /** @@ -177,14 +185,13 @@ garrow_field_get_name(GArrowField *field) * garrow_field_get_data_type: * @field: A #GArrowField. * - * Returns: (transfer full): The data type of the field. + * Returns: (transfer none): The data type of the field. */ GArrowDataType * garrow_field_get_data_type(GArrowField *field) { - const auto arrow_field = garrow_field_get_raw(field); - auto type = arrow_field->type(); - return garrow_data_type_new_raw(&type); + auto priv = GARROW_FIELD_GET_PRIVATE(field); + return priv->data_type; } /** @@ -233,11 +240,22 @@ garrow_field_to_string(GArrowField *field) G_END_DECLS GArrowField * -garrow_field_new_raw(std::shared_ptr<arrow::Field> *arrow_field) +garrow_field_new_raw(std::shared_ptr<arrow::Field> *arrow_field, + GArrowDataType *data_type) { + bool data_type_need_unref = false; + if (!data_type) { + auto arrow_data_type = (*arrow_field)->type(); + data_type = garrow_data_type_new_raw(&arrow_data_type); + data_type_need_unref = true; + } auto field = GARROW_FIELD(g_object_new(GARROW_TYPE_FIELD, "field", arrow_field, + "data-type", data_type, NULL)); + if (data_type_need_unref) { + g_object_unref(data_type); + } return field; } diff --git a/c_glib/arrow-glib/field.hpp b/c_glib/arrow-glib/field.hpp index e130ad5992409..f8d0d46c97ab4 100644 --- a/c_glib/arrow-glib/field.hpp +++ b/c_glib/arrow-glib/field.hpp @@ -23,5 +23,6 @@ #include -GArrowField *garrow_field_new_raw(std::shared_ptr<arrow::Field> *arrow_field); +GArrowField *garrow_field_new_raw(std::shared_ptr<arrow::Field> *arrow_field, + GArrowDataType *data_type); std::shared_ptr<arrow::Field> garrow_field_get_raw(GArrowField *field); diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index cb36e49067ac9..cb1fb3b04a68e 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -325,6 +325,30 @@ garrow_seekable_input_stream_read_at(GArrowSeekableInputStream *input_stream, } +/** + * garrow_seekable_input_stream_peek: + * @input_stream: A #GArrowSeekableInputStream. + * @n_bytes: The number of bytes to be peeked. + * + * Returns: (transfer full): The data of the buffer, up to the + * indicated number.
The data becomes invalid after any operation on + * the stream. If the stream is unbuffered, the data is empty. + * + * It should be freed with g_bytes_unref() when no longer needed. + * + * Since: 0.12.0 + */ +GBytes * +garrow_seekable_input_stream_peek(GArrowSeekableInputStream *input_stream, + gint64 n_bytes) +{ + auto arrow_random_access_file = + garrow_seekable_input_stream_get_raw(input_stream); + auto string_view = arrow_random_access_file->Peek(n_bytes); + return g_bytes_new_static(string_view.data(), string_view.size()); +} + + typedef struct GArrowBufferInputStreamPrivate_ { GArrowBuffer *buffer; } GArrowBufferInputStreamPrivate; diff --git a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 9deebd717363b..745b912749eb6 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -66,6 +66,9 @@ GArrowBuffer *garrow_seekable_input_stream_read_at(GArrowSeekableInputStream *in gint64 position, gint64 n_bytes, GError **error); +GARROW_AVAILABLE_IN_0_12 +GBytes *garrow_seekable_input_stream_peek(GArrowSeekableInputStream *input_stream, + gint64 n_bytes); #define GARROW_TYPE_BUFFER_INPUT_STREAM \ diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index d962ec103175e..14126bee8d784 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -27,7 +27,7 @@ sources = files( 'column.cpp', 'composite-array.cpp', 'composite-data-type.cpp', - 'decimal.cpp', + 'decimal128.cpp', 'error.cpp', 'field.cpp', 'record-batch.cpp', @@ -77,7 +77,7 @@ c_headers = files( 'composite-array.h', 'composite-data-type.h', 'data-type.h', - 'decimal.h', + 'decimal128.h', 'error.h', 'field.h', 'gobject-type.h', @@ -128,7 +128,7 @@ cpp_headers = files( 'codec.hpp', 'column.hpp', 'data-type.hpp', - 'decimal.hpp', + 'decimal128.hpp', 'error.hpp', 'field.hpp', 'record-batch.hpp', diff --git a/c_glib/arrow-glib/orc-file-reader.cpp b/c_glib/arrow-glib/orc-file-reader.cpp index bde3cfc8fa04f..31905a2f9fea1 100644 --- a/c_glib/arrow-glib/orc-file-reader.cpp +++ b/c_glib/arrow-glib/orc-file-reader.cpp @@ -199,8 +199,7 @@ garrow_orc_file_reader_new(GArrowSeekableInputStream *input, * Since: 0.10.0 * * Deprecated: 0.12.0: - * Use garrow_orc_file_reader_set_field_indices() instead. - * + * Use garrow_orc_file_reader_set_field_indices() instead. 
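The new peek API can be exercised against an in-memory stream. A hedged sketch (not part of the patch, assuming the existing GArrowBuffer and GArrowBufferInputStream APIs):

#include <arrow-glib/arrow-glib.h>

int
main(void)
{
  GArrowBuffer *buffer =
    garrow_buffer_new((const guint8 *)"Hello", 5);
  GArrowBufferInputStream *input =
    garrow_buffer_input_stream_new(buffer);

  /* Peek returns up to n_bytes without advancing the stream position;
   * the GBytes stays valid only until the next stream operation. */
  GBytes *peeked =
    garrow_seekable_input_stream_peek(GARROW_SEEKABLE_INPUT_STREAM(input), 5);
  g_print("peeked %" G_GSIZE_FORMAT " bytes\n", g_bytes_get_size(peeked));

  g_bytes_unref(peeked);
  g_object_unref(input);
  g_object_unref(buffer);
  return 0;
}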
*/ void garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, diff --git a/c_glib/arrow-glib/orc-file-reader.h b/c_glib/arrow-glib/orc-file-reader.h index 9b2dbadefe43a..9551d52e0fd55 100644 --- a/c_glib/arrow-glib/orc-file-reader.h +++ b/c_glib/arrow-glib/orc-file-reader.h @@ -39,22 +39,24 @@ garrow_orc_file_reader_new(GArrowSeekableInputStream *file, GError **error); #ifndef GARROW_DISABLE_DEPRECATED -G_GNUC_DEPRECATED_FOR(garrow_orc_file_reader_set_field_indices) +GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_set_field_indices) void garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, const gint *field_indexes, guint n_field_indexes); #endif +GARROW_AVAILABLE_IN_0_12 void garrow_orc_file_reader_set_field_indices(GArrowORCFileReader *reader, const gint *field_indices, guint n_field_indices); #ifndef GARROW_DISABLE_DEPRECATED -G_GNUC_DEPRECATED_FOR(garrow_orc_file_reader_get_field_indices) +GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_get_field_indices) const gint * garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader, guint *n_field_indexes); #endif +GARROW_AVAILABLE_IN_0_12 const gint * garrow_orc_file_reader_get_field_indices(GArrowORCFileReader *reader, guint *n_field_indices); diff --git a/c_glib/arrow-glib/schema.cpp b/c_glib/arrow-glib/schema.cpp index 1affaaede766b..1bbe82f9a3ca6 100644 --- a/c_glib/arrow-glib/schema.cpp +++ b/c_glib/arrow-glib/schema.cpp @@ -21,6 +21,7 @@ # include #endif +#include #include #include #include @@ -173,7 +174,7 @@ garrow_schema_get_field(GArrowSchema *schema, guint i) { const auto arrow_schema = garrow_schema_get_raw(schema); auto arrow_field = arrow_schema->field(i); - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, nullptr); } /** @@ -192,7 +193,8 @@ garrow_schema_get_field_by_name(GArrowSchema *schema, if (arrow_field == nullptr) { return NULL; } else { - return garrow_field_new_raw(&arrow_field); + auto arrow_data_type = arrow_field->type(); + return garrow_field_new_raw(&arrow_field, nullptr); } } @@ -223,7 +225,7 @@ garrow_schema_get_fields(GArrowSchema *schema) GList *fields = NULL; for (auto arrow_field : arrow_schema->fields()) { - GArrowField *field = garrow_field_new_raw(&arrow_field); + auto field = garrow_field_new_raw(&arrow_field, nullptr); fields = g_list_prepend(fields, field); } diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index f9e1b951a3658..b889eb2c9da23 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -21,8 +21,10 @@ # include #endif +#include #include #include +#include #include #include @@ -133,22 +135,218 @@ garrow_table_class_init(GArrowTableClass *klass) * @columns: (element-type GArrowColumn): The columns of the table. * * Returns: A newly created #GArrowTable. + * + * Deprecated: 0.12.0: Use garrow_table_new_values() instead. */ GArrowTable * garrow_table_new(GArrowSchema *schema, GList *columns) { + auto arrow_schema = garrow_schema_get_raw(schema); std::vector> arrow_columns; for (GList *node = columns; node; node = node->next) { - GArrowColumn *column = GARROW_COLUMN(node->data); + auto column = GARROW_COLUMN(node->data); arrow_columns.push_back(garrow_column_get_raw(column)); } - auto arrow_table = - arrow::Table::Make(garrow_schema_get_raw(schema), arrow_columns); + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); return garrow_table_new_raw(&arrow_table); } +/** + * garrow_table_new_values: (skip) + * @schema: The schema of the table. 
+ * @values: The values of the table. All values must be instances of the + * same class. Available classes are #GArrowColumn, #GArrowArray and + * #GArrowRecordBatch. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_values(GArrowSchema *schema, + GList *values, + GError **error) +{ + const auto context = "[table][new][values]"; + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector<std::shared_ptr<arrow::Column>> arrow_columns; + std::vector<std::shared_ptr<arrow::Array>> arrow_arrays; + std::vector<std::shared_ptr<arrow::RecordBatch>> arrow_record_batches; + for (GList *node = values; node; node = node->next) { + if (GARROW_IS_COLUMN(node->data)) { + auto column = GARROW_COLUMN(node->data); + arrow_columns.push_back(garrow_column_get_raw(column)); + } else if (GARROW_IS_ARRAY(node->data)) { + auto array = GARROW_ARRAY(node->data); + arrow_arrays.push_back(garrow_array_get_raw(array)); + } else if (GARROW_IS_RECORD_BATCH(node->data)) { + auto record_batch = GARROW_RECORD_BATCH(node->data); + arrow_record_batches.push_back(garrow_record_batch_get_raw(record_batch)); + } else { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: %s", + context, + "value must be one of " + "GArrowColumn, GArrowArray and GArrowRecordBatch"); + return NULL; + } + } + + size_t n_types = 0; + if (!arrow_columns.empty()) { + ++n_types; + } + if (!arrow_arrays.empty()) { + ++n_types; + } + if (!arrow_record_batches.empty()) { + ++n_types; + } + if (n_types > 1) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: %s", + context, + "all values must be the same objects of " + "GArrowColumn, GArrowArray or GArrowRecordBatch"); + return NULL; + } + + if (!arrow_columns.empty()) { + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } else if (!arrow_arrays.empty()) { + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } else { + std::shared_ptr<arrow::Table> arrow_table; + auto status = arrow::Table::FromRecordBatches(arrow_schema, + arrow_record_batches, + &arrow_table); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } +} + +/** + * garrow_table_new_columns: + * @schema: The schema of the table. + * @columns: (array length=n_columns): The columns of the table. + * @n_columns: The number of columns. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error.
+ * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_columns(GArrowSchema *schema, + GArrowColumn **columns, + gsize n_columns, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector<std::shared_ptr<arrow::Column>> arrow_columns; + for (gsize i = 0; i < n_columns; ++i) { + arrow_columns.push_back(garrow_column_get_raw(columns[i])); + } + + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, "[table][new][columns]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * garrow_table_new_arrays: + * @schema: The schema of the table. + * @arrays: (array length=n_arrays): The arrays of the table. + * @n_arrays: The number of arrays. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_arrays(GArrowSchema *schema, + GArrowArray **arrays, + gsize n_arrays, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector<std::shared_ptr<arrow::Array>> arrow_arrays; + for (gsize i = 0; i < n_arrays; ++i) { + arrow_arrays.push_back(garrow_array_get_raw(arrays[i])); + } + + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, "[table][new][arrays]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * garrow_table_new_record_batches: + * @schema: The schema of the table. + * @record_batches: (array length=n_record_batches): The record batches + * that have data for the table. + * @n_record_batches: The number of record batches. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_record_batches(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector<std::shared_ptr<arrow::RecordBatch>> arrow_record_batches; + for (gsize i = 0; i < n_record_batches; ++i) { + auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); + arrow_record_batches.push_back(arrow_record_batch); + } + + std::shared_ptr<arrow::Table> arrow_table; + auto status = arrow::Table::FromRecordBatches(arrow_schema, + arrow_record_batches, + &arrow_table); + if (garrow_error_check(error, status, "[table][new][record-batches]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + /** * garrow_table_equal: * @table: A #GArrowTable.
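For context, here is how the array-based constructor above fits together with the existing builder API. A sketch only (not part of the patch), with error propagation abbreviated:

#include <arrow-glib/arrow-glib.h>

static GArrowTable *
build_table(GError **error)
{
  /* Build one int32 array with a single value. */
  GArrowInt32ArrayBuilder *builder = garrow_int32_array_builder_new();
  if (!garrow_int32_array_builder_append_value(builder, 29, error)) {
    g_object_unref(builder);
    return NULL;
  }
  GArrowArray *array =
    garrow_array_builder_finish(GARROW_ARRAY_BUILDER(builder), error);
  g_object_unref(builder);
  if (!array) {
    return NULL;
  }

  /* A one-field schema describing the single column. */
  GArrowDataType *data_type = GARROW_DATA_TYPE(garrow_int32_data_type_new());
  GArrowField *field = garrow_field_new("number", data_type);
  GList *fields = g_list_prepend(NULL, field);
  GArrowSchema *schema = garrow_schema_new(fields);
  g_list_free(fields);

  /* The C-array variant avoids building the GList that
   * garrow_table_new_values() expects. */
  GArrowArray *arrays[] = {array};
  GArrowTable *table = garrow_table_new_arrays(schema, arrays, 1, error);

  g_object_unref(schema);
  g_object_unref(field);
  g_object_unref(data_type);
  g_object_unref(array);
  return table;
}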
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index ef7b0f5c289ce..bde2535033c7d 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -20,7 +20,9 @@ #pragma once #include +#include #include +#include G_BEGIN_DECLS @@ -35,8 +37,35 @@ struct _GArrowTableClass GObjectClass parent_class; }; -GArrowTable *garrow_table_new (GArrowSchema *schema, - GList *columns); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_table_new_values) +GArrowTable * +garrow_table_new(GArrowSchema *schema, + GList *columns); +#endif +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_values(GArrowSchema *schema, + GList *values, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_columns(GArrowSchema *schema, + GArrowColumn **columns, + gsize n_columns, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_arrays(GArrowSchema *schema, + GArrowArray **arrays, + gsize n_arrays, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_record_batches(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches, + GError **error); gboolean garrow_table_equal (GArrowTable *table, GArrowTable *other_table); diff --git a/c_glib/arrow-glib/type.cpp b/c_glib/arrow-glib/type.cpp index 0642004e2f07b..e227ed2c31fc8 100644 --- a/c_glib/arrow-glib/type.cpp +++ b/c_glib/arrow-glib/type.cpp @@ -66,6 +66,8 @@ garrow_type_from_raw(arrow::Type::type type) return GARROW_TYPE_STRING; case arrow::Type::type::BINARY: return GARROW_TYPE_BINARY; + case arrow::Type::type::FIXED_SIZE_BINARY: + return GARROW_TYPE_FIXED_SIZE_BINARY; case arrow::Type::type::DATE32: return GARROW_TYPE_DATE32; case arrow::Type::type::DATE64: diff --git a/c_glib/arrow-glib/type.h b/c_glib/arrow-glib/type.h index 2137c785515f8..85f55c452be55 100644 --- a/c_glib/arrow-glib/type.h +++ b/c_glib/arrow-glib/type.h @@ -40,6 +40,8 @@ G_BEGIN_DECLS * @GARROW_TYPE_DOUBLE: 8-byte floating point value. * @GARROW_TYPE_STRING: UTF-8 variable-length string. * @GARROW_TYPE_BINARY: Variable-length bytes (no guarantee of UTF-8-ness). + * @GARROW_TYPE_FIXED_SIZE_BINARY: Fixed-size binary. Each value occupies + * the same number of bytes. * @GARROW_TYPE_DATE32: int32 days since the UNIX epoch. * @GARROW_TYPE_DATE64: int64 milliseconds since the UNIX epoch. * @GARROW_TYPE_TIMESTAMP: Exact timestamp encoded with int64 since UNIX epoch. @@ -72,6 +74,7 @@ typedef enum { GARROW_TYPE_DOUBLE, GARROW_TYPE_STRING, GARROW_TYPE_BINARY, + GARROW_TYPE_FIXED_SIZE_BINARY, GARROW_TYPE_DATE32, GARROW_TYPE_DATE64, GARROW_TYPE_TIMESTAMP, diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index eb734250e2352..827b9c9a813b8 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,24 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_0_13: + * + * You can use this macro value for compile time API version check. + * + * Since: 0.13.0 + */ +#define GARROW_VERSION_0_13 G_ENCODE_VERSION(0, 13) + +/** + * GARROW_VERSION_0_12: + * + * You can use this macro value for compile time API version check. 
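The version macros follow the usual GLib versioning convention: define the minimum and maximum API versions you target before including the header, and deprecated or not-yet-available symbols become compiler warnings. A sketch (not part of the patch):

/* Target exactly the 0.12 API. */
#define GARROW_VERSION_MIN_REQUIRED GARROW_VERSION_0_12
#define GARROW_VERSION_MAX_ALLOWED GARROW_VERSION_0_12
#include <arrow-glib/arrow-glib.h>

/* With these settings, a call to garrow_table_new() (deprecated in
 * 0.12) warns via GARROW_DEPRECATED_IN_0_12_FOR(), and a call to a
 * GARROW_AVAILABLE_IN_0_13 symbol such as
 * garrow_list_data_type_get_field() warns that it is unavailable at
 * the requested maximum version. */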
+ * + * Since: 0.12.0 + */ +#define GARROW_VERSION_0_12 G_ENCODE_VERSION(0, 12) + /** * GARROW_VERSION_0_10: * @@ -166,6 +184,34 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_13 +# define GARROW_DEPRECATED_IN_0_13 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_0_13_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_0_13 +# define GARROW_DEPRECATED_IN_0_13_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_13 +# define GARROW_AVAILABLE_IN_0_13 GARROW_UNAVAILABLE(0, 13) +#else +# define GARROW_AVAILABLE_IN_0_13 +#endif + +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_12 +# define GARROW_DEPRECATED_IN_0_12 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_0_12_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_0_12 +# define GARROW_DEPRECATED_IN_0_12_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_12 +# define GARROW_AVAILABLE_IN_0_12 GARROW_UNAVAILABLE(0, 12) +#else +# define GARROW_AVAILABLE_IN_0_12 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_10 # define GARROW_DEPRECATED_IN_0_10 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_0_10_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/configure.ac b/c_glib/configure.ac index a6d8ed8e1d185..75654d2d7be25 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -17,12 +17,7 @@ AC_PREREQ(2.65) -m4_define([arrow_glib_version], - m4_esyscmd(grep "^ " "$(dirname $0)/../java/pom.xml" | \ - sed -E \ - -e 's/(^ )//g' \ - -e 's/(<\/version>$)//g' | \ - tr -d '\n')) +m4_define([arrow_glib_version], 0.13.0-SNAPSHOT) AC_INIT([arrow-glib], arrow_glib_version, [https://issues.apache.org/jira/browse/ARROW], @@ -283,7 +278,6 @@ AC_CONFIG_FILES([ doc/plasma-glib/entities.xml example/Makefile example/lua/Makefile - tool/Makefile ]) AC_OUTPUT diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 17b75005ff97a..1016703001b8c 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -53,7 +53,7 @@ Decimal - + Tensor @@ -163,6 +163,10 @@ Index of deprecated API + + Index of new symbols in 0.13.0 + + Index of new symbols in 0.12.0 diff --git a/c_glib/doc/parquet-glib/parquet-glib-docs.xml b/c_glib/doc/parquet-glib/parquet-glib-docs.xml index 0f2c30ba7863f..4485a6765cb6b 100644 --- a/c_glib/doc/parquet-glib/parquet-glib-docs.xml +++ b/c_glib/doc/parquet-glib/parquet-glib-docs.xml @@ -57,6 +57,10 @@ Index of deprecated API + + Index of new symbols in 0.12.0 + + Index of new symbols in 0.11.0 diff --git a/c_glib/example/build.c b/c_glib/example/build.c index 8c6cf74d74815..9b2d58d2b2bba 100644 --- a/c_glib/example/build.c +++ b/c_glib/example/build.c @@ -33,13 +33,13 @@ main(int argc, char **argv) builder = garrow_int32_array_builder_new(); if (success) { - success = garrow_int32_array_builder_append(builder, 29, &error); + success = garrow_int32_array_builder_append_value(builder, 29, &error); } if (success) { - success = garrow_int32_array_builder_append(builder, 2929, &error); + success = garrow_int32_array_builder_append_value(builder, 2929, &error); } if (success) { - success = garrow_int32_array_builder_append(builder, 292929, &error); + success = garrow_int32_array_builder_append_value(builder, 292929, &error); } if (!success) { g_print("failed to append: %s\n", error->message); diff --git a/c_glib/example/lua/Makefile.am b/c_glib/example/lua/Makefile.am index 
86bdbed8a0228..9019d24741c1a 100644 --- a/c_glib/example/lua/Makefile.am +++ b/c_glib/example/lua/Makefile.am @@ -20,6 +20,5 @@ dist_lua_example_DATA = \ README.md \ read-batch.lua \ read-stream.lua \ - stream-to-torch-tensor.lua \ write-batch.lua \ write-stream.lua diff --git a/c_glib/example/lua/README.md b/c_glib/example/lua/README.md index e7e3351fef148..7d388d46acb33 100644 --- a/c_glib/example/lua/README.md +++ b/c_glib/example/lua/README.md @@ -48,8 +48,3 @@ Here are example codes in this directory: * `read-stream.lua`: It shows how to read Arrow array from file in stream mode. - - * `stream-to-torch-tensor.lua`: It shows how to read Arrow array - from file in stream mode and convert it to - [Torch](http://torch.ch/)'s - [`Tensor` object](http://torch7.readthedocs.io/en/rtd/tensor/index.html). diff --git a/c_glib/example/lua/stream-to-torch-tensor.lua b/c_glib/example/lua/stream-to-torch-tensor.lua deleted file mode 100644 index fc765e3c96872..0000000000000 --- a/c_glib/example/lua/stream-to-torch-tensor.lua +++ /dev/null @@ -1,101 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. 
- -local lgi = require 'lgi' -local Arrow = lgi.Arrow - -local torch = require 'torch' - -Arrow.Array.torch_types = function(self) - return nil -end - -Arrow.Array.to_torch = function(self) - local types = self:torch_types() - if not types then - return nil - end - - local storage_type = types[1] - local tensor_type = types[2] - - local size = self:get_length() - local storage = storage_type(size) - if not storage then - return nil - end - - for i = 1, size do - storage[i] = self:get_value(i - 1) - end - return tensor_type(storage) -end - -Arrow.UInt8Array.torch_types = function(self) - return {torch.ByteStorage, torch.ByteTensor} -end - -Arrow.Int8Array.torch_types = function(self) - return {torch.CharStorage, torch.CharTensor} -end - -Arrow.Int16Array.torch_types = function(self) - return {torch.ShortStorage, torch.ShortTensor} -end - -Arrow.Int32Array.torch_types = function(self) - return {torch.IntStorage, torch.IntTensor} -end - -Arrow.Int64Array.torch_types = function(self) - return {torch.LongStorage, torch.LongTensor} -end - -Arrow.FloatArray.torch_types = function(self) - return {torch.FloatStorage, torch.FloatTensor} -end - -Arrow.DoubleArray.torch_types = function(self) - return {torch.DoubleStorage, torch.DoubleTensor} -end - - -local input_path = arg[1] or "/tmp/stream.arrow"; - -local input = Arrow.MemoryMappedInputStream.new(input_path) -local reader = Arrow.RecordBatchStreamReader.new(input) - -local i = 0 -while true do - local record_batch = reader:read_next_record_batch() - if not record_batch then - break - end - - print(string.rep("=", 40)) - print("record-batch["..i.."]:") - for j = 0, record_batch:get_n_columns() - 1 do - local column = record_batch:get_column(j) - local column_name = record_batch:get_column_name(j) - print(" "..column_name..":") - print(column:to_torch()) - end - - i = i + 1 -end - -input:close() diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 49d1d0b7168df..b2adf8560f246 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -22,10 +22,20 @@ #endif #include +#include #include #include +template +Type +ggandiva_literal_node_get(GGandivaLiteralNode *node) +{ + auto gandiva_literal_node = + std::static_pointer_cast(ggandiva_node_get_raw(GGANDIVA_NODE(node))); + return gandiva_literal_node->holder().get(); +} + G_BEGIN_DECLS /** @@ -40,15 +50,64 @@ G_BEGIN_DECLS * * #GGandivaFunctionNode is a class for a node in the expression tree, representing a function. * + * #GGandivaLiteralNode is a base class for a node in the expression tree, + * representing a literal. + * + * #GGandivaNullLiteralNode is a class for a node in the expression tree, + * representing a null literal. + * + * #GGandivaBooleanLiteralNode is a class for a node in the expression tree, + * representing a boolean literal. + * + * #GGandivaInt8LiteralNode is a class for a node in the expression tree, + * representing a 8-bit integer literal. + * + * #GGandivaUInt8LiteralNode is a class for a node in the expression tree, + * representing a 8-bit unsigned integer literal. + * + * #GGandivaInt16LiteralNode is a class for a node in the expression tree, + * representing a 16-bit integer literal. + * + * #GGandivaUInt16LiteralNode is a class for a node in the expression tree, + * representing a 16-bit unsigned integer literal. + * + * #GGandivaInt32LiteralNode is a class for a node in the expression tree, + * representing a 32-bit integer literal. 
+ * + * #GGandivaUInt32LiteralNode is a class for a node in the expression tree, + * representing a 32-bit unsigned integer literal. + * + * #GGandivaInt64LiteralNode is a class for a node in the expression tree, + * representing a 64-bit integer literal. + * + * #GGandivaUInt64LiteralNode is a class for a node in the expression tree, + * representing a 64-bit unsigned integer literal. + * + * #GGandivaFloatLiteralNode is a class for a node in the expression tree, + * representing a 32-bit floating point literal. + * + * #GGandivaDoubleLiteralNode is a class for a node in the expression tree, + * representing a 64-bit floating point literal. + * + * #GGandivaBinaryLiteralNode is a class for a node in the expression tree, + * representing a binary literal. + * + * #GGandivaStringLiteralNode is a class for a node in the expression tree, + * representing an UTF-8 encoded string literal. + * + * #GGandivaIfNode is a class for a node in the expression tree, representing an if-else. + * * Since: 0.12.0 */ typedef struct GGandivaNodePrivate_ { std::shared_ptr node; + GArrowDataType *return_type; } GGandivaNodePrivate; enum { - PROP_NODE = 1 + PROP_NODE = 1, + PROP_RETURN_TYPE }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, @@ -60,6 +119,19 @@ G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, ggandiva_node_get_instance_private( \ GGANDIVA_NODE(object))) +static void +ggandiva_node_dispose(GObject *object) +{ + auto priv = GGANDIVA_NODE_GET_PRIVATE(object); + + if (priv->return_type) { + g_object_unref(priv->return_type); + priv->return_type = nullptr; + } + + G_OBJECT_CLASS(ggandiva_node_parent_class)->dispose(object); +} + static void ggandiva_node_finalize(GObject *object) { @@ -83,6 +155,27 @@ ggandiva_node_set_property(GObject *object, priv->node = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_RETURN_TYPE: + priv->return_type = GARROW_DATA_TYPE(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_node_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_NODE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_RETURN_TYPE: + g_value_set_object(value, priv->return_type); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -97,19 +190,28 @@ ggandiva_node_init(GGandivaNode *object) static void ggandiva_node_class_init(GGandivaNodeClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = ggandiva_node_dispose; gobject_class->finalize = ggandiva_node_finalize; gobject_class->set_property = ggandiva_node_set_property; + gobject_class->get_property = ggandiva_node_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("node", "Node", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_NODE, spec); + + spec = g_param_spec_object("return-type", + "Return type", + "The return type", + GARROW_TYPE_DATA_TYPE, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RETURN_TYPE, spec); } @@ -223,12 +325,10 @@ ggandiva_field_node_new(GArrowField *field) typedef struct GGandivaFunctionNodePrivate_ { gchar *name; GList *parameters; - GArrowDataType *return_type; } GGandivaFunctionNodePrivate; enum { - PROP_NAME = 1, - PROP_RETURN_TYPE + PROP_NAME = 1 }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaFunctionNode, @@ -254,11 +354,6 
@@ ggandiva_function_node_dispose(GObject *object) priv->parameters = nullptr; } - if (priv->return_type) { - g_object_unref(priv->return_type); - priv->return_type = nullptr; - } - G_OBJECT_CLASS(ggandiva_function_node_parent_class)->dispose(object); } @@ -284,9 +379,6 @@ ggandiva_function_node_set_property(GObject *object, case PROP_NAME: priv->name = g_value_dup_string(value); break; - case PROP_RETURN_TYPE: - priv->return_type = GARROW_DATA_TYPE(g_value_dup_object(value)); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -305,9 +397,6 @@ ggandiva_function_node_get_property(GObject *object, case PROP_NAME: g_value_set_string(value, priv->name); break; - case PROP_RETURN_TYPE: - g_value_set_object(value, priv->return_type); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -339,14 +428,6 @@ ggandiva_function_node_class_init(GGandivaFunctionNodeClass *klass) static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_NAME, spec); - - spec = g_param_spec_object("return-type", - "Return type", - "The return type of the function", - GARROW_TYPE_DATA_TYPE, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RETURN_TYPE, spec); } /** @@ -395,42 +476,1039 @@ ggandiva_function_node_get_parameters(GGandivaFunctionNode *node) return priv->parameters; } -G_END_DECLS -std::shared_ptr -ggandiva_node_get_raw(GGandivaNode *node) +G_DEFINE_TYPE(GGandivaLiteralNode, + ggandiva_literal_node, + GGANDIVA_TYPE_NODE) + +static void +ggandiva_literal_node_init(GGandivaLiteralNode *literal_node) { - auto priv = GGANDIVA_NODE_GET_PRIVATE(node); - return priv->node; } -GGandivaFieldNode * -ggandiva_field_node_new_raw(std::shared_ptr *gandiva_node, - GArrowField *field) +static void +ggandiva_literal_node_class_init(GGandivaLiteralNodeClass *klass) { - auto field_node = g_object_new(GGANDIVA_TYPE_FIELD_NODE, - "node", gandiva_node, - "field", field, - NULL); - return GGANDIVA_FIELD_NODE(field_node); } -GGandivaFunctionNode * -ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, - const gchar *name, - GList *parameters, - GArrowDataType *return_type) + +G_DEFINE_TYPE(GGandivaNullLiteralNode, + ggandiva_null_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_null_literal_node_init(GGandivaNullLiteralNode *null_literal_node) { - auto function_node = g_object_new(GGANDIVA_TYPE_FUNCTION_NODE, - "node", gandiva_node, - "name", name, - "return-type", return_type, - NULL); - auto priv = GGANDIVA_FUNCTION_NODE_GET_PRIVATE(function_node); - for (auto node = parameters; node; node = g_list_next(node)) { - auto parameter = GGANDIVA_NODE(node->data); - priv->parameters = g_list_prepend(priv->parameters, g_object_ref(parameter)); +} + +static void +ggandiva_null_literal_node_class_init(GGandivaNullLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_null_literal_node_new: + * @return_type: A #GArrowDataType. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaNullLiteralNode for + * the type or %NULL on error. 
+ *
+ * Since: 0.12.0
+ */
+GGandivaNullLiteralNode *
+ggandiva_null_literal_node_new(GArrowDataType *return_type,
+                               GError **error)
+{
+  auto arrow_return_type = garrow_data_type_get_raw(return_type);
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeNull(arrow_return_type);
+  if (!gandiva_node) {
+    g_set_error(error,
+                GARROW_ERROR,
+                GARROW_ERROR_INVALID,
+                "[gandiva][null-literal-node][new] "
+                "failed to create: <%s>",
+                arrow_return_type->ToString().c_str());
+    return NULL;
   }
-  priv->parameters = g_list_reverse(priv->parameters);
-  return GGANDIVA_FUNCTION_NODE(function_node);
+  return GGANDIVA_NULL_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                  return_type));
+}
+
+
+G_DEFINE_TYPE(GGandivaBooleanLiteralNode,
+              ggandiva_boolean_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_boolean_literal_node_init(GGandivaBooleanLiteralNode *boolean_literal_node)
+{
+}
+
+static void
+ggandiva_boolean_literal_node_class_init(GGandivaBooleanLiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_boolean_literal_node_new:
+ * @value: The value of the boolean literal.
+ *
+ * Returns: A newly created #GGandivaBooleanLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaBooleanLiteralNode *
+ggandiva_boolean_literal_node_new(gboolean value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(static_cast<bool>(value));
+  return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                     NULL));
+}
+
+/**
+ * ggandiva_boolean_literal_node_get_value:
+ * @node: A #GGandivaBooleanLiteralNode.
+ *
+ * Returns: The value of the boolean literal.
+ *
+ * Since: 0.12.0
+ */
+gboolean
+ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node)
+{
+  auto value = ggandiva_literal_node_get<bool>(GGANDIVA_LITERAL_NODE(node));
+  return static_cast<gboolean>(value);
+}
+
+
+G_DEFINE_TYPE(GGandivaInt8LiteralNode,
+              ggandiva_int8_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int8_literal_node_init(GGandivaInt8LiteralNode *int8_literal_node)
+{
+}
+
+static void
+ggandiva_int8_literal_node_class_init(GGandivaInt8LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int8_literal_node_new:
+ * @value: The value of the 8-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt8LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt8LiteralNode *
+ggandiva_int8_literal_node_new(gint8 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                  NULL));
+}
+
+/**
+ * ggandiva_int8_literal_node_get_value:
+ * @node: A #GGandivaInt8LiteralNode.
+ *
+ * Returns: The value of the 8-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint8
+ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint8>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt8LiteralNode,
+              ggandiva_uint8_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint8_literal_node_init(GGandivaUInt8LiteralNode *uint8_literal_node)
+{
+}
+
+static void
+ggandiva_uint8_literal_node_class_init(GGandivaUInt8LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint8_literal_node_new:
+ * @value: The value of the 8-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt8LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt8LiteralNode *
+ggandiva_uint8_literal_node_new(guint8 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                   NULL));
+}
+
+/**
+ * ggandiva_uint8_literal_node_get_value:
+ * @node: A #GGandivaUInt8LiteralNode.
+ *
+ * Returns: The value of the 8-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint8
+ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint8>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaInt16LiteralNode,
+              ggandiva_int16_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int16_literal_node_init(GGandivaInt16LiteralNode *int16_literal_node)
+{
+}
+
+static void
+ggandiva_int16_literal_node_class_init(GGandivaInt16LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int16_literal_node_new:
+ * @value: The value of the 16-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt16LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt16LiteralNode *
+ggandiva_int16_literal_node_new(gint16 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                   NULL));
+}
+
+/**
+ * ggandiva_int16_literal_node_get_value:
+ * @node: A #GGandivaInt16LiteralNode.
+ *
+ * Returns: The value of the 16-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint16
+ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint16>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt16LiteralNode,
+              ggandiva_uint16_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint16_literal_node_init(GGandivaUInt16LiteralNode *uint16_literal_node)
+{
+}
+
+static void
+ggandiva_uint16_literal_node_class_init(GGandivaUInt16LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint16_literal_node_new:
+ * @value: The value of the 16-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt16LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt16LiteralNode *
+ggandiva_uint16_literal_node_new(guint16 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                    NULL));
+}
+
+/**
+ * ggandiva_uint16_literal_node_get_value:
+ * @node: A #GGandivaUInt16LiteralNode.
+ *
+ * Returns: The value of the 16-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint16
+ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint16>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaInt32LiteralNode,
+              ggandiva_int32_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int32_literal_node_init(GGandivaInt32LiteralNode *int32_literal_node)
+{
+}
+
+static void
+ggandiva_int32_literal_node_class_init(GGandivaInt32LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int32_literal_node_new:
+ * @value: The value of the 32-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt32LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt32LiteralNode *
+ggandiva_int32_literal_node_new(gint32 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                   NULL));
+}
+
+/**
+ * ggandiva_int32_literal_node_get_value:
+ * @node: A #GGandivaInt32LiteralNode.
+ *
+ * Returns: The value of the 32-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint32
+ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint32>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt32LiteralNode,
+              ggandiva_uint32_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint32_literal_node_init(GGandivaUInt32LiteralNode *uint32_literal_node)
+{
+}
+
+static void
+ggandiva_uint32_literal_node_class_init(GGandivaUInt32LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint32_literal_node_new:
+ * @value: The value of the 32-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt32LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt32LiteralNode *
+ggandiva_uint32_literal_node_new(guint32 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                    NULL));
+}
+
+/**
+ * ggandiva_uint32_literal_node_get_value:
+ * @node: A #GGandivaUInt32LiteralNode.
+ *
+ * Returns: The value of the 32-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint32
+ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint32>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaInt64LiteralNode,
+              ggandiva_int64_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int64_literal_node_init(GGandivaInt64LiteralNode *int64_literal_node)
+{
+}
+
+static void
+ggandiva_int64_literal_node_class_init(GGandivaInt64LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int64_literal_node_new:
+ * @value: The value of the 64-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt64LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt64LiteralNode *
+ggandiva_int64_literal_node_new(gint64 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                   NULL));
+}
+
+/**
+ * ggandiva_int64_literal_node_get_value:
+ * @node: A #GGandivaInt64LiteralNode.
+ *
+ * Returns: The value of the 64-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint64
+ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint64>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt64LiteralNode,
+              ggandiva_uint64_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint64_literal_node_init(GGandivaUInt64LiteralNode *uint64_literal_node)
+{
+}
+
+static void
+ggandiva_uint64_literal_node_class_init(GGandivaUInt64LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint64_literal_node_new:
+ * @value: The value of the 64-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt64LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt64LiteralNode *
+ggandiva_uint64_literal_node_new(guint64 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                    NULL));
+}
+
+/**
+ * ggandiva_uint64_literal_node_get_value:
+ * @node: A #GGandivaUInt64LiteralNode.
+ *
+ * Returns: The value of the 64-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint64
+ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint64>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaFloatLiteralNode,
+              ggandiva_float_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_float_literal_node_init(GGandivaFloatLiteralNode *float_literal_node)
+{
+}
+
+static void
+ggandiva_float_literal_node_class_init(GGandivaFloatLiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_float_literal_node_new:
+ * @value: The value of the 32-bit floating point literal.
+ *
+ * Returns: A newly created #GGandivaFloatLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaFloatLiteralNode *
+ggandiva_float_literal_node_new(gfloat value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                   NULL));
+}
+
+/**
+ * ggandiva_float_literal_node_get_value:
+ * @node: A #GGandivaFloatLiteralNode.
+ *
+ * Returns: The value of the 32-bit floating point literal.
+ *
+ * Since: 0.12.0
+ */
+gfloat
+ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node)
+{
+  return ggandiva_literal_node_get<gfloat>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaDoubleLiteralNode,
+              ggandiva_double_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_double_literal_node_init(GGandivaDoubleLiteralNode *double_literal_node)
+{
+}
+
+static void
+ggandiva_double_literal_node_class_init(GGandivaDoubleLiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_double_literal_node_new:
+ * @value: The value of the 64-bit floating point literal.
+ *
+ * Returns: A newly created #GGandivaDoubleLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaDoubleLiteralNode *
+ggandiva_double_literal_node_new(gdouble value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                    NULL));
+}
+
+/**
+ * ggandiva_double_literal_node_get_value:
+ * @node: A #GGandivaDoubleLiteralNode.
+ *
+ * Returns: The value of the 64-bit floating point literal.
+ *
+ * Since: 0.12.0
+ */
+gdouble
+ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node)
+{
+  return ggandiva_literal_node_get<gdouble>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+typedef struct GGandivaBinaryLiteralNodePrivate_ {
+  GBytes *value;
+} GGandivaBinaryLiteralNodePrivate;
+
+G_DEFINE_TYPE_WITH_PRIVATE(GGandivaBinaryLiteralNode,
+                           ggandiva_binary_literal_node,
+                           GGANDIVA_TYPE_LITERAL_NODE)
+
+#define GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(object)   \
+  static_cast<GGandivaBinaryLiteralNodePrivate *>(         \
+    ggandiva_binary_literal_node_get_instance_private(     \
+      GGANDIVA_BINARY_LITERAL_NODE(object)))
+
+static void
+ggandiva_binary_literal_node_dispose(GObject *object)
+{
+  auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(object);
+
+  if (priv->value) {
+    g_bytes_unref(priv->value);
+    priv->value = nullptr;
+  }
+
+  G_OBJECT_CLASS(ggandiva_binary_literal_node_parent_class)->dispose(object);
+}
+
+static void
+ggandiva_binary_literal_node_init(GGandivaBinaryLiteralNode *binary_literal_node)
+{
+}
+
+static void
+ggandiva_binary_literal_node_class_init(GGandivaBinaryLiteralNodeClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->dispose = ggandiva_binary_literal_node_dispose;
+}
+
+/**
+ * ggandiva_binary_literal_node_new:
+ * @value: (array length=size): The value of the binary literal.
+ * @size: The number of bytes of the value.
+ *
+ * Returns: A newly created #GGandivaBinaryLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaBinaryLiteralNode *
+ggandiva_binary_literal_node_new(const guint8 *value,
+                                 gsize size)
+{
+  auto gandiva_node =
+    gandiva::TreeExprBuilder::MakeBinaryLiteral(std::string(reinterpret_cast<const char *>(value),
+                                                            size));
+  return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                    NULL));
+}
+
+/**
+ * ggandiva_binary_literal_node_new_bytes:
+ * @value: The value of the binary literal.
+ *
+ * Returns: A newly created #GGandivaBinaryLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaBinaryLiteralNode *
+ggandiva_binary_literal_node_new_bytes(GBytes *value)
+{
+  gsize value_size;
+  auto raw_value = g_bytes_get_data(value, &value_size);
+  auto gandiva_node =
+    gandiva::TreeExprBuilder::MakeBinaryLiteral(
+      std::string(reinterpret_cast<const char *>(raw_value),
+                  value_size));
+  auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node,
+                                                    NULL);
+  auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(literal_node);
+  priv->value = value;
+  g_bytes_ref(priv->value);
+  return GGANDIVA_BINARY_LITERAL_NODE(literal_node);
+}
+
+/**
+ * ggandiva_binary_literal_node_get_value:
+ * @node: A #GGandivaBinaryLiteralNode.
+ *
+ * Returns: (transfer none): The value of the binary literal.
+ *
+ * Since: 0.12.0
+ */
+GBytes *
+ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node)
+{
+  auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(node);
+  if (!priv->value) {
+    auto value = ggandiva_literal_node_get<std::string>(GGANDIVA_LITERAL_NODE(node));
+    priv->value = g_bytes_new(value.data(), value.size());
+  }
+
+  return priv->value;
+}
+
+
+G_DEFINE_TYPE(GGandivaStringLiteralNode,
+              ggandiva_string_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_string_literal_node_init(GGandivaStringLiteralNode *string_literal_node)
+{
+}
+
+static void
+ggandiva_string_literal_node_class_init(GGandivaStringLiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_string_literal_node_new:
+ * @value: The value of the UTF-8 encoded string literal.
+ *
+ * Returns: A newly created #GGandivaStringLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaStringLiteralNode *
+ggandiva_string_literal_node_new(const gchar *value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeStringLiteral(value);
+  return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node,
+                                                                    NULL));
+}
+
+/**
+ * ggandiva_string_literal_node_get_value:
+ * @node: A #GGandivaStringLiteralNode.
+ *
+ * Returns: The value of the UTF-8 encoded string literal.
+ *
+ * Since: 0.12.0
+ */
+const gchar *
+ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node)
+{
+  auto &value = ggandiva_literal_node_get<std::string>(GGANDIVA_LITERAL_NODE(node));
+  return value.c_str();
+}
+
+
+typedef struct GGandivaIfNodePrivate_ {
+  GGandivaNode *condition_node;
+  GGandivaNode *then_node;
+  GGandivaNode *else_node;
+} GGandivaIfNodePrivate;
+
+enum {
+  PROP_CONDITION_NODE = 1,
+  PROP_THEN_NODE,
+  PROP_ELSE_NODE,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GGandivaIfNode,
+                           ggandiva_if_node,
+                           GGANDIVA_TYPE_NODE)
+
+#define GGANDIVA_IF_NODE_GET_PRIVATE(object)   \
+  static_cast<GGandivaIfNodePrivate *>(        \
+    ggandiva_if_node_get_instance_private(     \
+      GGANDIVA_IF_NODE(object)))
+
+static void
+ggandiva_if_node_dispose(GObject *object)
+{
+  auto priv = GGANDIVA_IF_NODE_GET_PRIVATE(object);
+
+  if (priv->condition_node) {
+    g_object_unref(priv->condition_node);
+    priv->condition_node = nullptr;
+  }
+
+  if (priv->then_node) {
+    g_object_unref(priv->then_node);
+    priv->then_node = nullptr;
+  }
+
+  if (priv->else_node) {
+    g_object_unref(priv->else_node);
+    priv->else_node = nullptr;
+  }
+
+  G_OBJECT_CLASS(ggandiva_if_node_parent_class)->dispose(object);
+}
+
+static void
+ggandiva_if_node_set_property(GObject *object,
+                              guint prop_id,
+                              const GValue *value,
+                              GParamSpec *pspec)
+{
+  auto priv = GGANDIVA_IF_NODE_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_CONDITION_NODE:
+    priv->condition_node = GGANDIVA_NODE(g_value_dup_object(value));
+    break;
+  case PROP_THEN_NODE:
+    priv->then_node = GGANDIVA_NODE(g_value_dup_object(value));
+    break;
+  case PROP_ELSE_NODE:
+    priv->else_node = GGANDIVA_NODE(g_value_dup_object(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+ggandiva_if_node_get_property(GObject *object,
+                              guint prop_id,
+                              GValue *value,
+                              GParamSpec *pspec)
+{
+  auto priv = GGANDIVA_IF_NODE_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_CONDITION_NODE:
+    g_value_set_object(value, priv->condition_node);
+    break;
+  case PROP_THEN_NODE:
+    g_value_set_object(value, priv->then_node);
+    break;
+  case PROP_ELSE_NODE:
+    g_value_set_object(value, priv->else_node);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+ggandiva_if_node_init(GGandivaIfNode *if_node)
+{
+}
+
+static void
+ggandiva_if_node_class_init(GGandivaIfNodeClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->dispose = ggandiva_if_node_dispose;
+  gobject_class->set_property = ggandiva_if_node_set_property;
+  gobject_class->get_property = ggandiva_if_node_get_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_object("condition-node",
+                             "Condition node",
+                             "The condition node",
+                             GGANDIVA_TYPE_NODE,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                      G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_CONDITION_NODE, spec);
+
+  spec = g_param_spec_object("then-node",
+                             "Then node",
+                             "The then node",
+                             GGANDIVA_TYPE_NODE,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                      G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_THEN_NODE, spec);
+
+  spec = g_param_spec_object("else-node",
+                             "Else node",
+                             "The else node",
+                             GGANDIVA_TYPE_NODE,
+                             static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                      G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_ELSE_NODE, spec);
+}
+
+/**
+ * ggandiva_if_node_new:
+ * @condition_node: the node with the condition for the if-else expression.
+ * @then_node: the node in case the condition node is true.
+ * @else_node: the node in case the condition node is false.
+ * @return_type: A #GArrowDataType.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GGandivaIfNode or %NULL on error.
+ *
+ * Since: 0.12.0
+ */
+GGandivaIfNode *
+ggandiva_if_node_new(GGandivaNode *condition_node,
+                     GGandivaNode *then_node,
+                     GGandivaNode *else_node,
+                     GArrowDataType *return_type,
+                     GError **error)
+{
+  if (!condition_node || !then_node || !else_node || !return_type) {
+    /* TODO: Improve error message to show which arguments are invalid. */
+    g_set_error(error,
+                GARROW_ERROR,
+                GARROW_ERROR_INVALID,
+                "[gandiva][if-literal-node][new] "
+                "all arguments must not be NULL");
+    return NULL;
+  }
+  auto gandiva_condition_node = ggandiva_node_get_raw(condition_node);
+  auto gandiva_then_node = ggandiva_node_get_raw(then_node);
+  auto gandiva_else_node = ggandiva_node_get_raw(else_node);
+  auto arrow_return_type = garrow_data_type_get_raw(return_type);
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeIf(gandiva_condition_node,
+                                                       gandiva_then_node,
+                                                       gandiva_else_node,
+                                                       arrow_return_type);
+  if (!gandiva_node) {
+    g_set_error(error,
+                GARROW_ERROR,
+                GARROW_ERROR_INVALID,
+                "[gandiva][if-literal-node][new] "
+                "failed to create: if (<%s>) {<%s>} else {<%s>} -> <%s>",
+                gandiva_condition_node->ToString().c_str(),
+                gandiva_then_node->ToString().c_str(),
+                gandiva_else_node->ToString().c_str(),
+                arrow_return_type->ToString().c_str());
+    return NULL;
+  }
+  return ggandiva_if_node_new_raw(&gandiva_node,
+                                  condition_node,
+                                  then_node,
+                                  else_node,
+                                  return_type);
+}
+
+G_END_DECLS
+
+std::shared_ptr<gandiva::Node>
+ggandiva_node_get_raw(GGandivaNode *node)
+{
+  auto priv = GGANDIVA_NODE_GET_PRIVATE(node);
+  return priv->node;
+}
+
+GGandivaFieldNode *
+ggandiva_field_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
+                            GArrowField *field)
+{
+  auto return_type = garrow_field_get_data_type(field);
+  auto field_node = g_object_new(GGANDIVA_TYPE_FIELD_NODE,
+                                 "node", gandiva_node,
+                                 "field", field,
+                                 "return-type", return_type,
+                                 NULL);
+  return GGANDIVA_FIELD_NODE(field_node);
+}
+
+GGandivaFunctionNode *
+ggandiva_function_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
+                               const gchar *name,
+                               GList *parameters,
+                               GArrowDataType *return_type)
+{
+  auto function_node = g_object_new(GGANDIVA_TYPE_FUNCTION_NODE,
+                                    "node", gandiva_node,
+                                    "name", name,
+                                    "return-type", return_type,
+                                    NULL);
+  auto priv = GGANDIVA_FUNCTION_NODE_GET_PRIVATE(function_node);
+  for (auto node = parameters; node; node = g_list_next(node)) {
+    auto parameter = GGANDIVA_NODE(node->data);
+    priv->parameters = g_list_prepend(priv->parameters, g_object_ref(parameter));
+  }
+  priv->parameters = g_list_reverse(priv->parameters);
+  return GGANDIVA_FUNCTION_NODE(function_node);
+}
+
+GGandivaLiteralNode *
+ggandiva_literal_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
+                              GArrowDataType *return_type)
+{
+  auto gandiva_literal_node =
+    std::static_pointer_cast<gandiva::LiteralNode>(*gandiva_node);
+
+  GGandivaLiteralNode *literal_node;
+  if (gandiva_literal_node->is_null()) {
+    literal_node =
+      GGANDIVA_LITERAL_NODE(g_object_new(GGANDIVA_TYPE_NULL_LITERAL_NODE,
+                                         "node", gandiva_node,
+                                         "return-type", return_type,
+                                         NULL));
+  } else {
+    GType type;
+
+    auto arrow_return_type = gandiva_literal_node->return_type();
+    switch (arrow_return_type->id()) {
+    case arrow::Type::BOOL:
+      type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE;
+      break;
+    case arrow::Type::UINT8:
+      type = GGANDIVA_TYPE_UINT8_LITERAL_NODE;
+      break;
+    case arrow::Type::UINT16:
+      type = GGANDIVA_TYPE_UINT16_LITERAL_NODE;
+      break;
+    case arrow::Type::UINT32:
+      type = GGANDIVA_TYPE_UINT32_LITERAL_NODE;
+      break;
+    case arrow::Type::UINT64:
+      type = GGANDIVA_TYPE_UINT64_LITERAL_NODE;
+      break;
+    case arrow::Type::INT8:
+      type = GGANDIVA_TYPE_INT8_LITERAL_NODE;
+      break;
+    case arrow::Type::INT16:
+      type = GGANDIVA_TYPE_INT16_LITERAL_NODE;
+      break;
+    case arrow::Type::INT32:
+      type = GGANDIVA_TYPE_INT32_LITERAL_NODE;
+      break;
+    case arrow::Type::INT64:
+      type = GGANDIVA_TYPE_INT64_LITERAL_NODE;
+      break;
+    case arrow::Type::FLOAT:
+      type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE;
+      break;
+    case arrow::Type::DOUBLE:
+      type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE;
+      break;
+    case arrow::Type::STRING:
+      type = GGANDIVA_TYPE_STRING_LITERAL_NODE;
+      break;
+    case arrow::Type::BINARY:
+      type = GGANDIVA_TYPE_BINARY_LITERAL_NODE;
+      break;
+    default:
+      type = GGANDIVA_TYPE_LITERAL_NODE;
+      break;
+    }
+
+    if (return_type) {
+      literal_node =
+        GGANDIVA_LITERAL_NODE(g_object_new(type,
+                                           "node", gandiva_node,
+                                           "return-type", return_type,
+                                           NULL));
+    } else {
+      return_type = garrow_data_type_new_raw(&arrow_return_type);
+      literal_node =
+        GGANDIVA_LITERAL_NODE(g_object_new(type,
+                                           "node", gandiva_node,
+                                           "return-type", return_type,
+                                           NULL));
+      g_object_unref(return_type);
+    }
+  }
+
+  return literal_node;
+}
+
+GGandivaIfNode *
+ggandiva_if_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
+                         GGandivaNode *condition_node,
+                         GGandivaNode *then_node,
+                         GGandivaNode *else_node,
+                         GArrowDataType *return_type)
+{
+  auto if_node = g_object_new(GGANDIVA_TYPE_IF_NODE,
+                              "node", gandiva_node,
+                              "condition-node", condition_node,
+                              "then-node", then_node,
+                              "else-node", else_node,
+                              "return-type", return_type,
+                              NULL);
+  return GGANDIVA_IF_NODE(if_node);
 }
diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h
index 98ab3afb6ae8f..ffcf41da10b21 100644
--- a/c_glib/gandiva-glib/node.h
+++ b/c_glib/gandiva-glib/node.h
@@ -35,6 +35,7 @@ struct _GGandivaNodeClass
   GObjectClass parent_class;
 };

+
 #define GGANDIVA_TYPE_FIELD_NODE (ggandiva_field_node_get_type())
 G_DECLARE_DERIVABLE_TYPE(GGandivaFieldNode,
                          ggandiva_field_node,
@@ -67,4 +68,275 @@ ggandiva_function_node_new(const gchar *name,
 GList *
 ggandiva_function_node_get_parameters(GGandivaFunctionNode *node);

+
+#define GGANDIVA_TYPE_LITERAL_NODE (ggandiva_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaLiteralNode,
+                         ggandiva_literal_node,
+                         GGANDIVA,
+                         LITERAL_NODE,
+                         GGandivaNode)
+struct _GGandivaLiteralNodeClass
+{
+  GGandivaNodeClass parent_class;
+};
+
+
+#define GGANDIVA_TYPE_NULL_LITERAL_NODE (ggandiva_null_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaNullLiteralNode,
+                         ggandiva_null_literal_node,
+                         GGANDIVA,
+                         NULL_LITERAL_NODE,
+                         GGandivaLiteralNode)
+struct _GGandivaNullLiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaNullLiteralNode *
+ggandiva_null_literal_node_new(GArrowDataType *return_type,
+                               GError
**error); + + +#define GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE (ggandiva_boolean_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaBooleanLiteralNode, + ggandiva_boolean_literal_node, + GGANDIVA, + BOOLEAN_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaBooleanLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaBooleanLiteralNode * +ggandiva_boolean_literal_node_new(gboolean value); +gboolean +ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node); + + +#define GGANDIVA_TYPE_INT8_LITERAL_NODE (ggandiva_int8_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt8LiteralNode, + ggandiva_int8_literal_node, + GGANDIVA, + INT8_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt8LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt8LiteralNode * +ggandiva_int8_literal_node_new(gint8 value); +gint8 +ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT8_LITERAL_NODE (ggandiva_uint8_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt8LiteralNode, + ggandiva_uint8_literal_node, + GGANDIVA, + UINT8_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt8LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt8LiteralNode * +ggandiva_uint8_literal_node_new(guint8 value); +guint8 +ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node); + + +#define GGANDIVA_TYPE_INT16_LITERAL_NODE (ggandiva_int16_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt16LiteralNode, + ggandiva_int16_literal_node, + GGANDIVA, + INT16_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt16LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt16LiteralNode * +ggandiva_int16_literal_node_new(gint16 value); +gint16 +ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT16_LITERAL_NODE (ggandiva_uint16_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16LiteralNode, + ggandiva_uint16_literal_node, + GGANDIVA, + UINT16_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt16LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt16LiteralNode * +ggandiva_uint16_literal_node_new(guint16 value); +guint16 +ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node); + + +#define GGANDIVA_TYPE_INT32_LITERAL_NODE (ggandiva_int32_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt32LiteralNode, + ggandiva_int32_literal_node, + GGANDIVA, + INT32_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt32LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt32LiteralNode * +ggandiva_int32_literal_node_new(gint32 value); +gint32 +ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT32_LITERAL_NODE (ggandiva_uint32_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32LiteralNode, + ggandiva_uint32_literal_node, + GGANDIVA, + UINT32_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt32LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt32LiteralNode * +ggandiva_uint32_literal_node_new(guint32 value); +guint32 +ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node); + + +#define GGANDIVA_TYPE_INT64_LITERAL_NODE (ggandiva_int64_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt64LiteralNode, + ggandiva_int64_literal_node, + GGANDIVA, + INT64_LITERAL_NODE, 
+                         GGandivaLiteralNode)
+struct _GGandivaInt64LiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaInt64LiteralNode *
+ggandiva_int64_literal_node_new(gint64 value);
+gint64
+ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node);
+
+
+#define GGANDIVA_TYPE_UINT64_LITERAL_NODE (ggandiva_uint64_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64LiteralNode,
+                         ggandiva_uint64_literal_node,
+                         GGANDIVA,
+                         UINT64_LITERAL_NODE,
+                         GGandivaLiteralNode)
+struct _GGandivaUInt64LiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaUInt64LiteralNode *
+ggandiva_uint64_literal_node_new(guint64 value);
+guint64
+ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node);
+
+
+#define GGANDIVA_TYPE_FLOAT_LITERAL_NODE (ggandiva_float_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaFloatLiteralNode,
+                         ggandiva_float_literal_node,
+                         GGANDIVA,
+                         FLOAT_LITERAL_NODE,
+                         GGandivaLiteralNode)
+struct _GGandivaFloatLiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaFloatLiteralNode *
+ggandiva_float_literal_node_new(gfloat value);
+gfloat
+ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node);
+
+
+#define GGANDIVA_TYPE_DOUBLE_LITERAL_NODE (ggandiva_double_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaDoubleLiteralNode,
+                         ggandiva_double_literal_node,
+                         GGANDIVA,
+                         DOUBLE_LITERAL_NODE,
+                         GGandivaLiteralNode)
+struct _GGandivaDoubleLiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaDoubleLiteralNode *
+ggandiva_double_literal_node_new(gdouble value);
+gdouble
+ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node);
+
+
+#define GGANDIVA_TYPE_BINARY_LITERAL_NODE (ggandiva_binary_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaBinaryLiteralNode,
+                         ggandiva_binary_literal_node,
+                         GGANDIVA,
+                         BINARY_LITERAL_NODE,
+                         GGandivaLiteralNode)
+struct _GGandivaBinaryLiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaBinaryLiteralNode *
+ggandiva_binary_literal_node_new(const guint8 *value,
+                                 gsize size);
+GGandivaBinaryLiteralNode *
+ggandiva_binary_literal_node_new_bytes(GBytes *value);
+GBytes *
+ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node);
+
+
+#define GGANDIVA_TYPE_STRING_LITERAL_NODE (ggandiva_string_literal_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaStringLiteralNode,
+                         ggandiva_string_literal_node,
+                         GGANDIVA,
+                         STRING_LITERAL_NODE,
+                         GGandivaLiteralNode)
+struct _GGandivaStringLiteralNodeClass
+{
+  GGandivaLiteralNodeClass parent_class;
+};
+
+GGandivaStringLiteralNode *
+ggandiva_string_literal_node_new(const gchar *value);
+const gchar *
+ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node);
+
+
+#define GGANDIVA_TYPE_IF_NODE (ggandiva_if_node_get_type())
+G_DECLARE_DERIVABLE_TYPE(GGandivaIfNode,
+                         ggandiva_if_node,
+                         GGANDIVA,
+                         IF_NODE,
+                         GGandivaNode)
+struct _GGandivaIfNodeClass
+{
+  GGandivaNodeClass parent_class;
+};
+
+GGandivaIfNode *
+ggandiva_if_node_new(GGandivaNode *condition_node,
+                     GGandivaNode *then_node,
+                     GGandivaNode *else_node,
+                     GArrowDataType *return_type,
+                     GError **error);
+
 G_END_DECLS
diff --git a/c_glib/gandiva-glib/node.hpp b/c_glib/gandiva-glib/node.hpp
index 953c214beb9d6..9a6ae98058699 100644
--- a/c_glib/gandiva-glib/node.hpp
+++ b/c_glib/gandiva-glib/node.hpp
@@ -21,6 +21,7 @@
 #include <gandiva/tree_expr_builder.h>
+#include <arrow-glib/data-type.hpp>
 #include <gandiva-glib/node.h>
@@ -34,3 +35,12 @@ ggandiva_function_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
                               const gchar
*name,
                               GList *parameters,
                               GArrowDataType *return_type);
+GGandivaLiteralNode *
+ggandiva_literal_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
+                              GArrowDataType *return_type);
+GGandivaIfNode *
+ggandiva_if_node_new_raw(std::shared_ptr<gandiva::Node> *gandiva_node,
+                         GGandivaNode *condition_node,
+                         GGandivaNode *then_node,
+                         GGandivaNode *else_node,
+                         GArrowDataType *return_type);
diff --git a/c_glib/meson.build b/c_glib/meson.build
index 194421c13d316..7113534ec5915 100644
--- a/c_glib/meson.build
+++ b/c_glib/meson.build
@@ -23,8 +23,7 @@ project('arrow-glib', 'c', 'cpp',
     'cpp_std=c++11',
   ])

-python = find_program('python', 'python3', 'python2')
-version = run_command(python, 'tool/get-version.py').stdout().strip()
+version = '0.13.0-SNAPSHOT'
 if version.endswith('-SNAPSHOT')
   version_numbers = version.split('-')[0].split('.')
   version_tag = version.split('-')[1]
diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp
index 398e85b02c08a..5c16e827fc14b 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -310,8 +310,8 @@ gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
     return NULL;
   }

-  std::shared_ptr<arrow::Array> arrow_array;
-  status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_array);
+  std::shared_ptr<arrow::ChunkedArray> arrow_chunked_array;
+  status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_chunked_array);
   if (!garrow_error_check(error,
                           status,
                           "[parquet][arrow][file-reader][read-column]")) {
@@ -319,7 +319,7 @@
   }

   auto arrow_field = arrow_schema->field(0);
-  auto arrow_column = std::make_shared<arrow::Column>(arrow_field, arrow_array);
+  auto arrow_column = std::make_shared<arrow::Column>(arrow_field, arrow_chunked_array);
   return garrow_column_new_raw(&arrow_column);
 }
diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp
index e88cb13e83cd0..2038ea61f042a 100644
--- a/c_glib/plasma-glib/client.cpp
+++ b/c_glib/plasma-glib/client.cpp
@@ -39,13 +39,140 @@ G_BEGIN_DECLS
 * @title: Client related classes
 * @include: plasma-glib/plasma-glib.h
 *
+ * #GPlasmaClientOptions is a class for customizing the plasma store
+ * connection.
+ *
 * #GPlasmaClientCreateOptions is a class for customizing object creation.
 *
- * #GPlasmaClient is a class for an interface with a plasma store
- * and a plasma manager.
+ * #GPlasmaClient is a class for an interface with a plasma store.
+ *
+ * Since: 0.12.0
+ */
+
+typedef struct GPlasmaClientOptionsPrivate_ {
+  gint n_retries;
+} GPlasmaClientOptionsPrivate;
+
+enum {
+  PROP_N_RETRIES = 1
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GPlasmaClientOptions,
+                           gplasma_client_options,
+                           G_TYPE_OBJECT)
+
+#define GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(object)   \
+  static_cast<GPlasmaClientOptionsPrivate *>(        \
+    gplasma_client_options_get_instance_private(     \
+      GPLASMA_CLIENT_OPTIONS(object)))
+
+static void
+gplasma_client_options_set_property(GObject *object,
+                                    guint prop_id,
+                                    const GValue *value,
+                                    GParamSpec *pspec)
+{
+  auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_N_RETRIES:
+    priv->n_retries = g_value_get_int(value);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gplasma_client_options_get_property(GObject *object,
+                                    guint prop_id,
+                                    GValue *value,
+                                    GParamSpec *pspec)
+{
+  auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_N_RETRIES:
+    g_value_set_int(value, priv->n_retries);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gplasma_client_options_init(GPlasmaClientOptions *object)
+{
+}
+
+static void
+gplasma_client_options_class_init(GPlasmaClientOptionsClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->set_property = gplasma_client_options_set_property;
+  gobject_class->get_property = gplasma_client_options_get_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_int("n-retries",
+                          "N retries",
+                          "The number of retries to connect to the plasma store. "
+                          "-1 means that the system default value is used.",
+                          -1,
+                          G_MAXINT,
+                          -1,
+                          static_cast<GParamFlags>(G_PARAM_READWRITE |
+                                                   G_PARAM_CONSTRUCT));
+  g_object_class_install_property(gobject_class, PROP_N_RETRIES, spec);
+}
+
+/**
+ * gplasma_client_options_new:
+ *
+ * Returns: A newly created #GPlasmaClientOptions.
+ *
+ * Since: 0.12.0
+ */
+GPlasmaClientOptions *
+gplasma_client_options_new(void)
+{
+  auto options = g_object_new(GPLASMA_TYPE_CLIENT_OPTIONS,
+                              NULL);
+  return GPLASMA_CLIENT_OPTIONS(options);
+}
+
+/**
+ * gplasma_client_options_set_n_retries:
+ * @options: A #GPlasmaClientOptions.
+ * @n_retries: The number of retries on connect.
+ *
+ * Since: 0.12.0
+ */
+void
+gplasma_client_options_set_n_retries(GPlasmaClientOptions *options,
+                                     gint n_retries)
+{
+  auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(options);
+  priv->n_retries = n_retries;
+}
+
+/**
+ * gplasma_client_options_get_n_retries:
+ * @options: A #GPlasmaClientOptions.
+ *
+ * Returns: The number of retries on connect.
 *
 * Since: 0.12.0
 */
+gint
+gplasma_client_options_get_n_retries(GPlasmaClientOptions *options)
+{
+  auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(options);
+  return priv->n_retries;
+}
+

 typedef struct GPlasmaClientCreateOptionsPrivate_ {
   guint8 *metadata;
@@ -183,8 +310,10 @@ gplasma_client_create_options_get_metadata(GPlasmaClientCreateOptions *options,
   return priv->metadata;
 }

+
 typedef struct GPlasmaClientPrivate_ {
   plasma::PlasmaClient *client;
+  bool disconnected;
 } GPlasmaClientPrivate;

 enum {
@@ -205,10 +334,12 @@ gplasma_client_finalize(GObject *object)
 {
   auto priv = GPLASMA_CLIENT_GET_PRIVATE(object);

-  auto status = priv->client->Disconnect();
-  if (!status.ok()) {
-    g_warning("[plasma][client][finalize] Failed to disconnect: %s",
-              status.ToString().c_str());
+  if (!priv->disconnected) {
+    auto status = priv->client->Disconnect();
+    if (!status.ok()) {
+      g_warning("[plasma][client][finalize] Failed to disconnect: %s",
+                status.ToString().c_str());
+    }
   }

   delete priv->client;
@@ -260,6 +391,7 @@ gplasma_client_class_init(GPlasmaClientClass *klass)
 /**
  * gplasma_client_new:
  * @store_socket_name: The name of the UNIX domain socket.
+ * @options: (nullable): The options to customize how to connect to the plasma store.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
  * Returns: (nullable): A newly created #GPlasmaClient on success,
@@ -269,10 +401,15 @@ gplasma_client_class_init(GPlasmaClientClass *klass)
  */
 GPlasmaClient *
 gplasma_client_new(const gchar *store_socket_name,
+                   GPlasmaClientOptions *options,
                    GError **error)
 {
   auto plasma_client = new plasma::PlasmaClient();
-  auto status = plasma_client->Connect(store_socket_name, "");
+  int n_retries = -1;
+  if (options) {
+    n_retries = gplasma_client_options_get_n_retries(options);
+  }
+  auto status = plasma_client->Connect(store_socket_name, "", 0, n_retries);
   if (garrow_error_check(error, status, "[plasma][client][new]")) {
     return gplasma_client_new_raw(plasma_client);
   } else {
@@ -431,6 +568,29 @@ gplasma_client_refer_object(GPlasmaClient *client,
   }
 }

+/**
+ * gplasma_client_disconnect:
+ * @client: A #GPlasmaClient.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ * + * Since: 0.12.0 + */ +gboolean +gplasma_client_disconnect(GPlasmaClient *client, + GError **error) +{ + auto priv = GPLASMA_CLIENT_GET_PRIVATE(client); + auto status = priv->client->Disconnect(); + if (garrow_error_check(error, status, "[plasma][client][disconnect]")) { + priv->disconnected = true; + return TRUE; + } else { + return FALSE; + } +} + G_END_DECLS GPlasmaClient * diff --git a/c_glib/plasma-glib/client.h b/c_glib/plasma-glib/client.h index 6f99f467c83a7..2cb983e14e970 100644 --- a/c_glib/plasma-glib/client.h +++ b/c_glib/plasma-glib/client.h @@ -23,6 +23,26 @@ G_BEGIN_DECLS +#define GPLASMA_TYPE_CLIENT_OPTIONS (gplasma_client_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaClientOptions, + gplasma_client_options, + GPLASMA, + CLIENT_OPTIONS, + GObject) + +struct _GPlasmaClientOptionsClass +{ + GObjectClass parent_class; +}; + +GPlasmaClientOptions *gplasma_client_options_new(void); +void +gplasma_client_options_set_n_retries(GPlasmaClientOptions *options, + gint n_retries); +gint +gplasma_client_options_get_n_retries(GPlasmaClientOptions *options); + + #define GPLASMA_TYPE_CLIENT_CREATE_OPTIONS \ (gplasma_client_create_options_get_type()) G_DECLARE_DERIVABLE_TYPE(GPlasmaClientCreateOptions, @@ -59,6 +79,7 @@ struct _GPlasmaClientClass }; GPlasmaClient *gplasma_client_new(const gchar *store_socket_name, + GPlasmaClientOptions *options, GError **error); GPlasmaCreatedObject * gplasma_client_create(GPlasmaClient *client, @@ -71,5 +92,7 @@ gplasma_client_refer_object(GPlasmaClient *client, GPlasmaObjectID *id, gint64 timeout_ms, GError **error); +gboolean gplasma_client_disconnect(GPlasmaClient *client, + GError **error); G_END_DECLS diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in index f3a82c237d0b9..c82fe69580f1f 100644 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ b/c_glib/plasma-glib/plasma-glib.pc.in @@ -25,4 +25,4 @@ Description: C API for Apache Arrow Plasma based on GLib Version: @VERSION@ Libs: -L${libdir} -lplasma-glib Cflags: -I${includedir} -Requires: plasma arrow-glib @ARROW_GPU_GLIB_PACKAGE@ +Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ diff --git a/c_glib/test/gandiva/test-binary-literal-node.rb b/c_glib/test/gandiva/test-binary-literal-node.rb new file mode 100644 index 0000000000000..fddf74830d4ab --- /dev/null +++ b/c_glib/test/gandiva/test-binary-literal-node.rb @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaBinaryLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = "\x00\x01\x02\x03\x04" + end + + sub_test_case(".new") do + def test_string + node = Gandiva::BinaryLiteralNode.new(@value) + assert_equal(@value, node.value.to_s) + end + + def test_bytes + bytes_value = GLib::Bytes.new(@value) + node = Gandiva::BinaryLiteralNode.new(bytes_value) + assert_equal(@value, node.value.to_s) + end + end + + sub_test_case("instance methods") do + def setup + super + @node = Gandiva::BinaryLiteralNode.new(@value) + end + + def test_return_type + assert_equal(Arrow::BinaryDataType.new, @node.return_type) + end + end +end diff --git a/c_glib/test/gandiva/test-boolean-literal-node.rb b/c_glib/test/gandiva/test-boolean-literal-node.rb new file mode 100644 index 0000000000000..6e18a76218595 --- /dev/null +++ b/c_glib/test/gandiva/test-boolean-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaBooleanLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = true + @node = Gandiva::BooleanLiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value?) + end + + def test_return_type + assert_equal(Arrow::BooleanDataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-double-literal-node.rb b/c_glib/test/gandiva/test-double-literal-node.rb new file mode 100644 index 0000000000000..27cc3aea23b32 --- /dev/null +++ b/c_glib/test/gandiva/test-double-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+class TestGandivaDoubleLiteralNode < Test::Unit::TestCase
+  def setup
+    omit("Gandiva is required") unless defined?(::Gandiva)
+    @value = 1.5
+    @node = Gandiva::DoubleLiteralNode.new(@value)
+  end
+
+  def test_value
+    assert_equal(@value, @node.value)
+  end
+
+  def test_return_type
+    assert_equal(Arrow::DoubleDataType.new, @node.return_type)
+  end
+end
diff --git a/c_glib/test/gandiva/test-field-node.rb b/c_glib/test/gandiva/test-field-node.rb
index c5bfe6cfc9743..51db285bcc0bf 100644
--- a/c_glib/test/gandiva/test-field-node.rb
+++ b/c_glib/test/gandiva/test-field-node.rb
@@ -18,11 +18,15 @@
 class TestGandivaFieldNode < Test::Unit::TestCase
   def setup
     omit("Gandiva is required") unless defined?(::Gandiva)
+    @field = Arrow::Field.new("valid", Arrow::BooleanDataType.new)
+    @node = Gandiva::FieldNode.new(@field)
   end

   def test_field
-    field = Arrow::Field.new("valid", Arrow::BooleanDataType.new)
-    field_node = Gandiva::FieldNode.new(field)
-    assert_equal(field, field_node.field)
+    assert_equal(@field, @node.field)
+  end
+
+  def test_return_type
+    assert_equal(@field.data_type, @node.return_type)
   end
 end
diff --git a/c_glib/tool/get-version.py b/c_glib/test/gandiva/test-float-literal-node.rb
old mode 100755
new mode 100644
similarity index 68%
rename from c_glib/tool/get-version.py
rename to c_glib/test/gandiva/test-float-literal-node.rb
index aacea6da3e865..4a49eb37441d1
--- a/c_glib/tool/get-version.py
+++ b/c_glib/test/gandiva/test-float-literal-node.rb
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-#
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
 # distributed with this work for additional information
@@ -17,13 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.

-import os
-import re
+class TestGandivaFloatLiteralNode < Test::Unit::TestCase
+  def setup
+    omit("Gandiva is required") unless defined?(::Gandiva)
+    @value = 1.5
+    @node = Gandiva::FloatLiteralNode.new(@value)
+  end
+
+  def test_value
+    assert_equal(@value, @node.value)
+  end

-root = os.environ.get("MESON_SOURCE_ROOT", ".")
-pom_xml = os.path.join(root, "..", "java", "pom.xml")
-with open(pom_xml) as pom:
-    version_tag = re.search('^  <version>(.+)</version>',
-                            pom.read(),
-                            re.MULTILINE)
-    print(version_tag.group(1))
+  def test_return_type
+    assert_equal(Arrow::FloatDataType.new, @node.return_type)
+  end
+end
diff --git a/c_glib/test/gandiva/test-if-node.rb b/c_glib/test/gandiva/test-if-node.rb
new file mode 100644
index 0000000000000..b00359590905d
--- /dev/null
+++ b/c_glib/test/gandiva/test-if-node.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +class TestGandivaIfNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + @then_node = Gandiva::FieldNode.new(field1) + @else_node = Gandiva::FieldNode.new(field2) + @return_type = Arrow::Int32DataType.new + @condition_node = Gandiva::FunctionNode.new("greater_than", + [@then_node, @else_node], + @return_type) + @if_node = Gandiva::IfNode.new(@condition_node, + @then_node, + @else_node, + @return_type) + end + + def test_readers + assert_equal([ + @condition_node, + @then_node, + @else_node, + @return_type + ], + [ + @if_node.condition_node, + @if_node.then_node, + @if_node.else_node, + @if_node.return_type + ]) + end +end diff --git a/c_glib/test/gandiva/test-int16-literal-node.rb b/c_glib/test/gandiva/test-int16-literal-node.rb new file mode 100644 index 0000000000000..f8e6b26849496 --- /dev/null +++ b/c_glib/test/gandiva/test-int16-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt16LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 15) + @node = Gandiva::Int16LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int16DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-int32-literal-node.rb b/c_glib/test/gandiva/test-int32-literal-node.rb new file mode 100644 index 0000000000000..3d1bf588cf7dc --- /dev/null +++ b/c_glib/test/gandiva/test-int32-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaInt32LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 31) + @node = Gandiva::Int32LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int32DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-int64-literal-node.rb b/c_glib/test/gandiva/test-int64-literal-node.rb new file mode 100644 index 0000000000000..b2ca3bf630b43 --- /dev/null +++ b/c_glib/test/gandiva/test-int64-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt64LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 63) + @node = Gandiva::Int64LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int64DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-int8-literal-node.rb b/c_glib/test/gandiva/test-int8-literal-node.rb new file mode 100644 index 0000000000000..8d917bd1b4dfe --- /dev/null +++ b/c_glib/test/gandiva/test-int8-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt8LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 7) + @node = Gandiva::Int8LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int8DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-null-literal-node.rb b/c_glib/test/gandiva/test-null-literal-node.rb new file mode 100644 index 0000000000000..ae14f3c15e411 --- /dev/null +++ b/c_glib/test/gandiva/test-null-literal-node.rb @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaNullLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_invalid_type + return_type = Arrow::NullDataType.new + message = + "[gandiva][null-literal-node][new] " + + "failed to create: <#{return_type}>" + assert_raise(Arrow::Error::Invalid.new(message)) do + Gandiva::NullLiteralNode.new(return_type) + end + end + + def test_return_type + return_type = Arrow::BooleanDataType.new + literal_node = Gandiva::NullLiteralNode.new(return_type) + assert_equal(return_type, literal_node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-string-literal-node.rb b/c_glib/test/gandiva/test-string-literal-node.rb new file mode 100644 index 0000000000000..8a397ab4d1a9b --- /dev/null +++ b/c_glib/test/gandiva/test-string-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaStringLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = "Hello" + @node = Gandiva::StringLiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::StringDataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-uint16-literal-node.rb b/c_glib/test/gandiva/test-uint16-literal-node.rb new file mode 100644 index 0000000000000..971da38881df6 --- /dev/null +++ b/c_glib/test/gandiva/test-uint16-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt16LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 16 - 1 + @node = Gandiva::UInt16LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt16DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-uint32-literal-node.rb b/c_glib/test/gandiva/test-uint32-literal-node.rb new file mode 100644 index 0000000000000..8fcab7fefad87 --- /dev/null +++ b/c_glib/test/gandiva/test-uint32-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt32LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 32 - 1 + @node = Gandiva::UInt32LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt32DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-uint64-literal-node.rb b/c_glib/test/gandiva/test-uint64-literal-node.rb new file mode 100644 index 0000000000000..d5afddcd75f44 --- /dev/null +++ b/c_glib/test/gandiva/test-uint64-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
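The unsigned literal tests mirror the signed ones, using the type maximum (2 ** n - 1) as the boundary value; the UInt64 test below instead uses a small value (3), presumably to sidestep marshaling of values above the signed 64-bit range through the GI bindings (an assumption, the patch gives no reason). These literals also slot directly into the expression-tree classes tested earlier in this diff. A hedged sketch; the field name is illustrative, and the Boolean return type for "greater_than" is an assumption (the IfNode test above passes its Int32 return type instead):

    # Assumes Arrow and Gandiva loaded via GI as in the earlier sketch.
    field = Arrow::Field.new("score", Arrow::Int32DataType.new)   # illustrative name
    field_node = Gandiva::FieldNode.new(field)
    threshold = Gandiva::Int32LiteralNode.new(100)
    condition = Gandiva::FunctionNode.new("greater_than",
                                          [field_node, threshold],
                                          Arrow::BooleanDataType.new)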
+ +class TestGandivaUInt64LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = 3 + @node = Gandiva::UInt64LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt64DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-uint8-literal-node.rb b/c_glib/test/gandiva/test-uint8-literal-node.rb new file mode 100644 index 0000000000000..8ce91d599f435 --- /dev/null +++ b/c_glib/test/gandiva/test-uint8-literal-node.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt8LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 8 - 1 + @node = Gandiva::UInt8LiteralNode.new(@value) + end + + def test_value + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt8DataType.new, @node.return_type) + end +end diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index d6d1ff89b6a3e..f3ae709512eeb 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -135,20 +135,20 @@ def append_to_builder(builder, value) data_type = builder.value_data_type case data_type when Arrow::ListDataType - builder.append + builder.append_value value_builder = builder.value_builder value.each do |v| append_to_builder(value_builder, v) end when Arrow::StructDataType - builder.append + builder.append_value value.each do |name, v| field_index = data_type.get_field_index(name) field_builder = builder.get_field_builder(field_index) append_to_builder(field_builder, v) end else - builder.append(value) + builder.append_value(value) end end end @@ -179,7 +179,7 @@ def build_array(builder, values) if value.nil? builder.append_null else - builder.append(value) + builder.append_value(value) end end builder.finish diff --git a/ci/travis_script_gandiva_cpp.sh b/c_glib/test/plasma/test-plasma-client-options.rb old mode 100755 new mode 100644 similarity index 69% rename from ci/travis_script_gandiva_cpp.sh rename to c_glib/test/plasma/test-plasma-client-options.rb index 4d0a9b7a6bac4..abe6fd3ce46ff --- a/ci/travis_script_gandiva_cpp.sh +++ b/c_glib/test/plasma/test-plasma-client-options.rb @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,17 +15,17 @@ # specific language governing permissions and limitations # under the License. 
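The rename git reports here (ci/travis_script_gandiva_cpp.sh to c_glib/test/plasma/test-plasma-client-options.rb, similarity index 69%) is an artifact of both files sharing the Apache license header: the Gandiva C++ ctest steps are simply deleted, and an unrelated Plasma test is added in their place. That new test covers Plasma::ClientOptions, which Plasma::Client.new now takes as a second argument (nil is accepted, as the other Plasma test updates in this patch show). A minimal sketch drawn from the test body:

    # Assumes Plasma loaded via GI; store is the suite's Helper::PlasmaStore
    # (store = Helper::PlasmaStore.new; store.start), per the test helpers.
    options = Plasma::ClientOptions.new
    options.n_retries            # => -1 (the default)
    options.n_retries = 10
    client = Plasma::Client.new(store.socket_path, options)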
-set -e - -source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh - -pushd $CPP_BUILD_DIR - -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva,unittest - -# not running in parallel, since some of them are benchmarks -PATH=$ARROW_BUILD_TYPE:$PATH ctest -VV -L gandiva,integ +class TestPlasmaClientOptions < Test::Unit::TestCase + include Helper::Omittable -popd + def setup + omit("Plasma is required") unless defined?(::Plasma) + @options = Plasma::ClientOptions.new + end -# TODO : Capture C++ coverage info + test("n_retries") do + assert_equal(-1, @options.n_retries) + @options.n_retries = 10 + assert_equal(10, @options.n_retries) + end +end diff --git a/c_glib/test/plasma/test-plasma-client.rb b/c_glib/test/plasma/test-plasma-client.rb index cbdce865f0132..a57d1fc5944e9 100644 --- a/c_glib/test/plasma/test-plasma-client.rb +++ b/c_glib/test/plasma/test-plasma-client.rb @@ -23,7 +23,11 @@ def setup omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start - @client = Plasma::Client.new(@store.socket_path) + @options = Plasma::ClientOptions.new + @client = Plasma::Client.new(@store.socket_path, @options) + @id = Plasma::ObjectID.new("Hello") + @data = "World" + @options = Plasma::ClientCreateOptions.new end def teardown @@ -34,10 +38,7 @@ def teardown def setup super - @id = Plasma::ObjectID.new("Hello") - @data = "World" @metadata = "Metadata" - @options = Plasma::ClientCreateOptions.new end test("no options") do @@ -84,4 +85,11 @@ def setup ]) end end + + test("#disconnect") do + @client.disconnect + assert_raise(Arrow::Error::Io) do + @client.create(@id, @data.bytesize, @options) + end + end end diff --git a/c_glib/test/plasma/test-plasma-created-object.rb b/c_glib/test/plasma/test-plasma-created-object.rb index 54d6774790abe..9025ff4ac22d9 100644 --- a/c_glib/test/plasma/test-plasma-created-object.rb +++ b/c_glib/test/plasma/test-plasma-created-object.rb @@ -21,7 +21,7 @@ def setup omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start - @client = Plasma::Client.new(@store.socket_path) + @client = Plasma::Client.new(@store.socket_path, nil) @id = Plasma::ObjectID.new("Hello") @data = "World" diff --git a/c_glib/test/plasma/test-plasma-referred-object.rb b/c_glib/test/plasma/test-plasma-referred-object.rb index f55c0b13c5603..a74641ed5dcd3 100644 --- a/c_glib/test/plasma/test-plasma-referred-object.rb +++ b/c_glib/test/plasma/test-plasma-referred-object.rb @@ -21,7 +21,7 @@ def setup omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start - @client = Plasma::Client.new(@store.socket_path) + @client = Plasma::Client.new(@store.socket_path, nil) @id = Plasma::ObjectID.new("Hello") @data = "World" diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index 12fba7346c36f..3befde3c7a9bb 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -42,7 +42,7 @@ def test_equal_range def test_is_null builder = Arrow::BooleanArrayBuilder.new builder.append_null - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal([true, false], array.length.times.collect {|i| array.null?(i)}) @@ -51,7 +51,7 @@ def test_is_null def test_is_valid builder = Arrow::BooleanArrayBuilder.new builder.append_null - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal([false, true], array.length.times.collect {|i| array.valid?(i)}) @@ -59,7 +59,7 @@ def test_is_valid def test_length builder = 
Arrow::BooleanArrayBuilder.new - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal(1, array.length) end @@ -75,10 +75,10 @@ def test_n_nulls def test_null_bitmap builder = Arrow::BooleanArrayBuilder.new builder.append_null - builder.append(true) - builder.append(false) + builder.append_value(true) + builder.append_value(false) builder.append_null - builder.append(false) + builder.append_value(false) array = builder.finish assert_equal(0b10110, array.null_bitmap.data.to_s.unpack("c*")[0]) end @@ -97,9 +97,9 @@ def test_value_type def test_slice builder = Arrow::BooleanArrayBuilder.new - builder.append(true) - builder.append(false) - builder.append(true) + builder.append_value(true) + builder.append_value(false) + builder.append_value(true) array = builder.finish sub_array = array.slice(1, 2) assert_equal([false, true], diff --git a/c_glib/test/test-binary-array.rb b/c_glib/test/test-binary-array.rb index 2dfd9cfbaaf14..0dcaf4eef60c5 100644 --- a/c_glib/test/test-binary-array.rb +++ b/c_glib/test/test-binary-array.rb @@ -32,7 +32,7 @@ def test_new def test_value data = "\x00\x01\x02" builder = Arrow::BinaryArrayBuilder.new - builder.append(data) + builder.append_value(data) array = builder.finish assert_equal(data, array.get_value(0).to_s) end @@ -41,8 +41,8 @@ def test_buffer data1 = "\x00\x01\x02" data2 = "\x03\x04\x05" builder = Arrow::BinaryArrayBuilder.new - builder.append(data1) - builder.append(data2) + builder.append_value(data1) + builder.append_value(data2) array = builder.finish assert_equal(data1 + data2, array.buffer.data.to_s) end @@ -51,8 +51,8 @@ def test_offsets_buffer data1 = "\x00\x01" data2 = "\x02\x03\x04" builder = Arrow::BinaryArrayBuilder.new - builder.append(data1) - builder.append(data2) + builder.append_value(data1) + builder.append_value(data2) array = builder.finish byte_per_offset = 4 assert_equal([0, 2, 5].pack("l*"), diff --git a/c_glib/test/test-boolean-array.rb b/c_glib/test/test-boolean-array.rb index ac07ec995ea32..e8c7e5efe2fc5 100644 --- a/c_glib/test/test-boolean-array.rb +++ b/c_glib/test/test-boolean-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::BooleanArrayBuilder.new - builder.append(true) - builder.append(false) - builder.append(true) + builder.append_value(true) + builder.append_value(false) + builder.append_value(true) array = builder.finish assert_equal([0b101].pack("C*"), array.buffer.data.to_s) end def test_value builder = Arrow::BooleanArrayBuilder.new - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal(true, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 3, 1) builder = Arrow::BooleanArrayBuilder.new - builder.append(true) - builder.append(false) - builder.append(true) + builder.append_value(true) + builder.append_value(false) + builder.append_value(true) array = builder.finish assert_equal([true, false, true], array.values) end diff --git a/c_glib/test/test-buffer-input-stream.rb b/c_glib/test/test-buffer-input-stream.rb index f5a0132d2da98..cb6a667b3b7c0 100644 --- a/c_glib/test/test-buffer-input-stream.rb +++ b/c_glib/test/test-buffer-input-stream.rb @@ -39,4 +39,12 @@ def test_align read_buffer = buffer_input_stream.read(3) assert_equal("rld", read_buffer.data.to_s) end + + def test_peek + buffer = Arrow::Buffer.new("Hello World") + buffer_input_stream = Arrow::BufferInputStream.new(buffer) + peeked_data = buffer_input_stream.peek(5) + assert_equal(buffer_input_stream.read(5).data.to_s, + 
peeked_data.to_s) + end end diff --git a/c_glib/test/test-cuda.rb b/c_glib/test/test-cuda.rb index 32d486ef8ba97..ae915307b70f0 100644 --- a/c_glib/test/test-cuda.rb +++ b/c_glib/test/test-cuda.rb @@ -58,7 +58,7 @@ def test_export Arrow = GI.load("Arrow") ArrowCUDA = GI.load("ArrowCUDA") -manager = ArrowCUDA::ADeviceManager.new +manager = ArrowCUDA::DeviceManager.new context = manager.get_context(0) serialized_handle = #{serialized_handle.to_s.dump} handle = ArrowCUDA::IPCMemoryHandle.new(serialized_handle) diff --git a/c_glib/test/test-date32-array.rb b/c_glib/test/test-date32-array.rb index f1425693f381e..09ef78650bd59 100644 --- a/c_glib/test/test-date32-array.rb +++ b/c_glib/test/test-date32-array.rb @@ -34,9 +34,9 @@ def test_buffer after_epoch = 17406 # 2017-08-28 builder = Arrow::Date32ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch].pack("l*"), array.buffer.data.to_s) @@ -46,7 +46,7 @@ def test_value after_epoch = 17406 # 2017-08-28 builder = Arrow::Date32ArrayBuilder.new - builder.append(after_epoch) + builder.append_value(after_epoch) array = builder.finish assert_equal(after_epoch, array.get_value(0)) end @@ -56,9 +56,9 @@ def test_values after_epoch = 17406 # 2017-08-28 builder = Arrow::Date32ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch], array.values) end diff --git a/c_glib/test/test-date64-array.rb b/c_glib/test/test-date64-array.rb index 1ea9f5a6a0545..4d9f189196fc8 100644 --- a/c_glib/test/test-date64-array.rb +++ b/c_glib/test/test-date64-array.rb @@ -34,9 +34,9 @@ def test_buffer after_epoch = 1503878400000 # 2017-08-28T00:00:00Z builder = Arrow::Date64ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch].pack("q*"), array.buffer.data.to_s) @@ -46,7 +46,7 @@ def test_value after_epoch = 1503878400000 # 2017-08-28T00:00:00Z builder = Arrow::Date64ArrayBuilder.new - builder.append(after_epoch) + builder.append_value(after_epoch) array = builder.finish assert_equal(after_epoch, array.get_value(0)) end @@ -56,9 +56,9 @@ def test_values after_epoch = 1503878400000 # 2017-08-28T00:00:00Z builder = Arrow::Date64ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch], array.values) end diff --git a/c_glib/test/test-decimal-array.rb b/c_glib/test/test-decimal128-array.rb similarity index 85% rename from c_glib/test/test-decimal-array.rb rename to c_glib/test/test-decimal128-array.rb index a65e10037659a..132ceb7788585 100644 --- a/c_glib/test/test-decimal-array.rb +++ b/c_glib/test/test-decimal128-array.rb @@ -15,21 +15,21 @@ # specific language governing permissions and limitations # under the License. 
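The decimal tests that follow track two renames at once: Arrow::DecimalDataType becomes Arrow::Decimal128DataType, aligning the data-type name with the existing Arrow::Decimal128 value class, and the builder calls adopt the suite-wide append to append_value rename seen throughout this patch. A condensed sketch taken from the updated test bodies:

    data_type = Arrow::Decimal128DataType.new(8, 2)   # precision 8, scale 2
    builder = Arrow::Decimal128ArrayBuilder.new(data_type)
    builder.append_value(Arrow::Decimal128.new("23423445"))
    array = builder.finish
    array.format_value(0)   # => "234234.45"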
-class TestDecimalArray < Test::Unit::TestCase +class TestDecimal128Array < Test::Unit::TestCase def test_format_value - data_type = Arrow::DecimalDataType.new(8,2) + data_type = Arrow::Decimal128DataType.new(8, 2) builder = Arrow::Decimal128ArrayBuilder.new(data_type) decimal = Arrow::Decimal128.new("23423445") - builder.append(decimal) + builder.append_value(decimal) array = builder.finish assert_equal("234234.45", array.format_value(0)) end def test_value - data_type = Arrow::DecimalDataType.new(8,2) + data_type = Arrow::Decimal128DataType.new(8, 2) builder = Arrow::Decimal128ArrayBuilder.new(data_type) decimal = Arrow::Decimal128.new("23423445") - builder.append(decimal) + builder.append_value(decimal) array = builder.finish assert_equal("234234.45", array.get_value(0).to_string_scale(array.value_data_type.scale)) diff --git a/c_glib/test/test-decimal-data-type.rb b/c_glib/test/test-decimal128-data-type.rb similarity index 80% rename from c_glib/test/test-decimal-data-type.rb rename to c_glib/test/test-decimal128-data-type.rb index 04bfe78f925c0..27a31e28309cd 100644 --- a/c_glib/test/test-decimal-data-type.rb +++ b/c_glib/test/test-decimal128-data-type.rb @@ -15,24 +15,24 @@ # specific language governing permissions and limitations # under the License. -class TestDecimalDataType < Test::Unit::TestCase +class TestDecimal128DataType < Test::Unit::TestCase def test_type - data_type = Arrow::DecimalDataType.new(2, 0) + data_type = Arrow::Decimal128DataType.new(2, 0) assert_equal(Arrow::Type::DECIMAL, data_type.id) end def test_to_s - data_type = Arrow::DecimalDataType.new(2, 0) + data_type = Arrow::Decimal128DataType.new(2, 0) assert_equal("decimal(2, 0)", data_type.to_s) end def test_precision - data_type = Arrow::DecimalDataType.new(8, 2) + data_type = Arrow::Decimal128DataType.new(8, 2) assert_equal(8, data_type.precision) end def test_scale - data_type = Arrow::DecimalDataType.new(8, 2) + data_type = Arrow::Decimal128DataType.new(8, 2) assert_equal(2, data_type.scale) end end diff --git a/c_glib/test/test-decimal.rb b/c_glib/test/test-decimal128.rb similarity index 56% rename from c_glib/test/test-decimal.rb rename to c_glib/test/test-decimal128.rb index 99f1912babfae..de9453cbe69cd 100644 --- a/c_glib/test/test-decimal.rb +++ b/c_glib/test/test-decimal128.rb @@ -106,4 +106,101 @@ def test_divide_zero decimal1.divide(decimal2) end end + + def test_equal + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(10) + other_decimal2 = Arrow::Decimal128.new(11) + assert_equal([ + true, + false, + ], + [ + decimal == other_decimal1, + decimal == other_decimal2, + ]) + end + + def test_not_equal + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(10) + other_decimal2 = Arrow::Decimal128.new(11) + assert_equal([ + false, + true, + ], + [ + decimal != other_decimal1, + decimal != other_decimal2, + ]) + end + + def test_less_than + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + true, + false, + false + ], + [ + decimal < other_decimal1, + decimal < other_decimal2, + decimal < decimal, + ]) + end + + def test_less_than_or_equal + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + true, + false, + true + ], + [ + decimal <= other_decimal1, + decimal <= other_decimal2, + 
decimal <= decimal + ]) + end + + def test_greater_than + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + false, + true, + false + ], + [ + decimal > other_decimal1, + decimal > other_decimal2, + decimal > decimal + ]) + end + + def test_greater_than_or_equal + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + false, + true, + true + ], + [ + decimal >= other_decimal1, + decimal >= other_decimal2, + decimal >= decimal + ]) + end end diff --git a/c_glib/test/test-dense-union-data-type.rb b/c_glib/test/test-dense-union-data-type.rb index 0d1295423ebbb..231767f8a5441 100644 --- a/c_glib/test/test-dense-union-data-type.rb +++ b/c_glib/test/test-dense-union-data-type.rb @@ -17,11 +17,19 @@ class TestDenseUnionDataType < Test::Unit::TestCase def setup - fields = [ - Arrow::Field.new("number", Arrow::Int32DataType.new), - Arrow::Field.new("text", Arrow::StringDataType.new), + @number_field_data_type = Arrow::Int32DataType.new + @text_field_data_type = Arrow::StringDataType.new + @field_data_types = [ + @number_field_data_type, + @text_field_data_type, ] - @data_type = Arrow::DenseUnionDataType.new(fields, [2, 9]) + @number_field = Arrow::Field.new("number", @number_field_data_type) + @text_field = Arrow::Field.new("text", @text_field_data_type) + @fields = [ + @number_field, + @text_field, + ] + @data_type = Arrow::DenseUnionDataType.new(@fields, [2, 9]) end def test_type @@ -32,4 +40,21 @@ def test_to_s assert_equal("union[dense]", @data_type.to_s) end + + def test_fields + assert_equal(@fields.zip(@field_data_types), + @data_type.fields.collect {|field| [field, field.data_type]}) + end + + def test_get_field + field = @data_type.get_field(0) + assert_equal([ + @fields[0], + @field_data_types[0], + ], + [ + field, + field.data_type, + ]) + end end diff --git a/c_glib/test/test-double-array.rb b/c_glib/test/test-double-array.rb index 1213a5dfe53d6..020ed8f079960 100644 --- a/c_glib/test/test-double-array.rb +++ b/c_glib/test/test-double-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::DoubleArrayBuilder.new - builder.append(-1.1) - builder.append(2.2) - builder.append(-4.4) + builder.append_value(-1.1) + builder.append_value(2.2) + builder.append_value(-4.4) array = builder.finish assert_equal([-1.1, 2.2, -4.4].pack("d*"), array.buffer.data.to_s) end def test_value builder = Arrow::DoubleArrayBuilder.new - builder.append(1.5) + builder.append_value(1.5) array = builder.finish assert_in_delta(1.5, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::DoubleArrayBuilder.new - builder.append(1.5) - builder.append(3) - builder.append(4.5) + builder.append_value(1.5) + builder.append_value(3) + builder.append_value(4.5) array = builder.finish assert_equal([1.5, 3.0, 4.5], array.values) end diff --git a/c_glib/test/test-fixed-size-binary-data-type.rb b/c_glib/test/test-fixed-size-binary-data-type.rb new file mode 100644 index 0000000000000..584fb3deec93d --- /dev/null +++ b/c_glib/test/test-fixed-size-binary-data-type.rb @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFixedSizeBinaryDataType < Test::Unit::TestCase + def setup + @byte_width = 10 + @data_type = Arrow::FixedSizeBinaryDataType.new(@byte_width) + end + + def test_type + assert_equal(Arrow::Type::FIXED_SIZE_BINARY, @data_type.id) + end + + def test_to_s + assert_equal("fixed_size_binary[10]", @data_type.to_s) + end + + def test_byte_width + assert_equal(@byte_width, @data_type.byte_width) + end + + def test_bit_width + assert_equal(@byte_width * 8, @data_type.bit_width) + end +end diff --git a/c_glib/test/test-float-array.rb b/c_glib/test/test-float-array.rb index c8e1b4d864c08..c2a71a0dd39db 100644 --- a/c_glib/test/test-float-array.rb +++ b/c_glib/test/test-float-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::FloatArrayBuilder.new - builder.append(-1.1) - builder.append(2.2) - builder.append(-4.4) + builder.append_value(-1.1) + builder.append_value(2.2) + builder.append_value(-4.4) array = builder.finish assert_equal([-1.1, 2.2, -4.4].pack("f*"), array.buffer.data.to_s) end def test_value builder = Arrow::FloatArrayBuilder.new - builder.append(1.5) + builder.append_value(1.5) array = builder.finish assert_in_delta(1.5, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::FloatArrayBuilder.new - builder.append(1.5) - builder.append(3) - builder.append(4.5) + builder.append_value(1.5) + builder.append_value(3) + builder.append_value(4.5) array = builder.finish assert_equal([1.5, 3.0, 4.5], array.values) end diff --git a/c_glib/test/test-int16-array.rb b/c_glib/test/test-int16-array.rb index 13646e0d5b818..e0efb68019b24 100644 --- a/c_glib/test/test-int16-array.rb +++ b/c_glib/test/test-int16-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::Int16ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("s*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int16ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::Int16ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-int32-array.rb b/c_glib/test/test-int32-array.rb index d1579a8eba881..9827e532bf154 100644 --- a/c_glib/test/test-int32-array.rb +++ b/c_glib/test/test-int32-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::Int32ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array 
= builder.finish assert_equal([-1, 2, -4].pack("l*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int32ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end def test_values builder = Arrow::Int32ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-int64-array.rb b/c_glib/test/test-int64-array.rb index 5d9c37a55c084..39a74d34e23fa 100644 --- a/c_glib/test/test-int64-array.rb +++ b/c_glib/test/test-int64-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::Int64ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("q*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int64ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end def test_values builder = Arrow::Int64ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-int8-array.rb b/c_glib/test/test-int8-array.rb index e17c10c53611e..46fe591a575c2 100644 --- a/c_glib/test/test-int8-array.rb +++ b/c_glib/test/test-int8-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::Int8ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("c*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int8ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end def test_values builder = Arrow::Int8ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-list-array.rb b/c_glib/test/test-list-array.rb index 14f84067ac525..271d32236acbd 100644 --- a/c_glib/test/test-list-array.rb +++ b/c_glib/test/test-list-array.rb @@ -38,14 +38,14 @@ def test_value builder = Arrow::ListArrayBuilder.new(data_type) value_builder = builder.value_builder - builder.append - value_builder.append(-29) - value_builder.append(29) + builder.append_value + value_builder.append_value(-29) + value_builder.append_value(29) - builder.append - value_builder.append(-1) - value_builder.append(0) - value_builder.append(1) + builder.append_value + value_builder.append_value(-1) + value_builder.append_value(0) + value_builder.append_value(1) array = builder.finish value = array.get_value(1) diff --git a/c_glib/test/test-list-data-type.rb b/c_glib/test/test-list-data-type.rb index aa6a8fa65fd8c..78df28a144aa3 100644 --- a/c_glib/test/test-list-data-type.rb +++ b/c_glib/test/test-list-data-type.rb @@ -16,21 +16,28 @@ # under the License. 
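The ListDataType test below is restructured around a setup method and another accessor rename: #value_field becomes #field, and the test now also verifies that the returned field keeps its data type. In sketch form, assuming the same GI environment as the other tests:

    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
    data_type = Arrow::ListDataType.new(field)
    data_type.id                # => Arrow::Type::LIST
    data_type.field             # => the "enabled" field (formerly #value_field)
    data_type.field.data_type   # => an Arrow::BooleanDataType instance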
class TestListDataType < Test::Unit::TestCase + def setup + @field_data_type = Arrow::BooleanDataType.new + @field = Arrow::Field.new("enabled", @field_data_type) + @data_type = Arrow::ListDataType.new(@field) + end + def test_type - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - data_type = Arrow::ListDataType.new(field) - assert_equal(Arrow::Type::LIST, data_type.id) + assert_equal(Arrow::Type::LIST, @data_type.id) end def test_to_s - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - data_type = Arrow::ListDataType.new(field) - assert_equal("list", data_type.to_s) + assert_equal("list", @data_type.to_s) end - def test_value_field - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - data_type = Arrow::ListDataType.new(field) - assert_equal(field, data_type.value_field) + def test_field + assert_equal([ + @field, + @field_data_type, + ], + [ + @data_type.field, + @data_type.field.data_type, + ]) end end diff --git a/c_glib/test/test-sparse-union-data-type.rb b/c_glib/test/test-sparse-union-data-type.rb index ff4ce72c274a3..30e24f7a11c9b 100644 --- a/c_glib/test/test-sparse-union-data-type.rb +++ b/c_glib/test/test-sparse-union-data-type.rb @@ -17,11 +17,19 @@ class TestSparseUnionDataType < Test::Unit::TestCase def setup - fields = [ - Arrow::Field.new("number", Arrow::Int32DataType.new), - Arrow::Field.new("text", Arrow::StringDataType.new), + @number_field_data_type = Arrow::Int32DataType.new + @text_field_data_type = Arrow::StringDataType.new + @field_data_types = [ + @number_field_data_type, + @text_field_data_type, ] - @data_type = Arrow::SparseUnionDataType.new(fields, [2, 9]) + @number_field = Arrow::Field.new("number", @number_field_data_type) + @text_field = Arrow::Field.new("text", @text_field_data_type) + @fields = [ + @number_field, + @text_field, + ] + @data_type = Arrow::SparseUnionDataType.new(@fields, [2, 9]) end def test_type @@ -32,4 +40,21 @@ def test_to_s assert_equal("union[sparse]", @data_type.to_s) end + + def test_fields + assert_equal(@fields.zip(@field_data_types), + @data_type.fields.collect {|field| [field, field.data_type]}) + end + + def test_get_field + field = @data_type.get_field(0) + assert_equal([ + @fields[0], + @field_data_types[0], + ], + [ + field, + field.data_type, + ]) + end end diff --git a/c_glib/test/test-string-array.rb b/c_glib/test/test-string-array.rb index a9edb0ae49152..61459edbb8059 100644 --- a/c_glib/test/test-string-array.rb +++ b/c_glib/test/test-string-array.rb @@ -31,15 +31,15 @@ def test_new def test_value builder = Arrow::StringArrayBuilder.new - builder.append("Hello") + builder.append_value("Hello") array = builder.finish assert_equal("Hello", array.get_string(0)) end def test_buffer builder = Arrow::StringArrayBuilder.new - builder.append("Hello") - builder.append("World") + builder.append_value("Hello") + builder.append_value("World") array = builder.finish assert_equal("HelloWorld", array.buffer.data.to_s) end diff --git a/c_glib/test/test-struct-array.rb b/c_glib/test/test-struct-array.rb index 78760a9b30984..af7e299d8b7ce 100644 --- a/c_glib/test/test-struct-array.rb +++ b/c_glib/test/test-struct-array.rb @@ -58,13 +58,13 @@ def test_flatten data_type = Arrow::StructDataType.new(fields) builder = Arrow::StructArrayBuilder.new(data_type) - builder.append - builder.get_field_builder(0).append(-29) - builder.get_field_builder(1).append(true) + builder.append_value + builder.get_field_builder(0).append_value(-29) + builder.get_field_builder(1).append_value(true) - builder.append 
- builder.field_builders[0].append(2) - builder.field_builders[1].append(false) + builder.append_value + builder.field_builders[0].append_value(2) + builder.field_builders[1].append_value(false) array = builder.finish values = array.length.times.collect do |i| diff --git a/c_glib/test/test-struct-data-type.rb b/c_glib/test/test-struct-data-type.rb index ce94e41c70148..82ce19ec6a495 100644 --- a/c_glib/test/test-struct-data-type.rb +++ b/c_glib/test/test-struct-data-type.rb @@ -17,8 +17,14 @@ class TestStructDataType < Test::Unit::TestCase def setup - @enabled_field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - @message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + @enabled_field_data_type = Arrow::BooleanDataType.new + @message_field_data_type = Arrow::StringDataType.new + @field_data_types = [ + @enabled_field_data_type, + @message_field_data_type, + ] + @enabled_field = Arrow::Field.new("enabled", @enabled_field_data_type) + @message_field = Arrow::Field.new("message", @message_field_data_type) @fields = [@enabled_field, @message_field] @data_type = Arrow::StructDataType.new(@fields) end @@ -37,7 +43,8 @@ def test_n_fields end def test_fields - assert_equal(@fields, @data_type.fields) + assert_equal(@fields.zip(@field_data_types), + @data_type.fields.collect {|field| [field, field.data_type]}) end sub_test_case("#get_field") do @@ -52,6 +59,18 @@ def test_negative def test_over assert_equal(nil, @data_type.get_field(2)) end + + def test_data_type + field = @data_type.get_field(0) + assert_equal([ + @fields[0], + @field_data_types[0], + ], + [ + field, + field.data_type, + ]) + end end sub_test_case("#get_field_by_name") do @@ -64,9 +83,21 @@ def test_not_found assert_equal(nil, @data_type.get_field_by_name("nonexistent")) end + + def test_data_type + field = @data_type.get_field_by_name("enabled") + assert_equal([ + @enabled_field, + @enabled_field_data_type, + ], + [ + field, + field.data_type, + ]) + end end - sub_test_case("#get_field_by_name") do + sub_test_case("#get_field_index") do def test_found assert_equal(@fields.index(@enabled_field), @data_type.get_field_index("enabled")) diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb index 4394ad1353e7d..871e0d7c5ffd4 100644 --- a/c_glib/test/test-table.rb +++ b/c_glib/test/test-table.rb @@ -17,21 +17,19 @@ class TestTable < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable sub_test_case(".new") do - def test_columns - fields = [ + def setup + @fields = [ Arrow::Field.new("visible", Arrow::BooleanDataType.new), Arrow::Field.new("valid", Arrow::BooleanDataType.new), ] - schema = Arrow::Schema.new(fields) - columns = [ - Arrow::Column.new(fields[0], build_boolean_array([true])), - Arrow::Column.new(fields[1], build_boolean_array([false])), - ] - table = Arrow::Table.new(schema, columns) + @schema = Arrow::Schema.new(@fields) + end - data = table.n_columns.times.collect do |i| + def dump_table(table) + table.n_columns.times.collect do |i| column = table.get_column(i) values = [] column.data.chunks.each do |chunk| @@ -44,11 +42,54 @@ def test_columns values, ] end + end + + def test_columns + columns = [ + Arrow::Column.new(@fields[0], build_boolean_array([true])), + Arrow::Column.new(@fields[1], build_boolean_array([false])), + ] + table = Arrow::Table.new(@schema, columns) assert_equal([ ["visible", [true]], ["valid", [false]], ], - data) + dump_table(table)) + end + + def test_arrays + require_gi_bindings(3, 3, 1) + arrays = [ + build_boolean_array([true]), + 
build_boolean_array([false]), + ] + table = Arrow::Table.new(@schema, arrays) + assert_equal([ + ["visible", [true]], + ["valid", [false]], + ], + dump_table(table)) + end + + def test_record_batches + require_gi_bindings(3, 3, 1) + record_batches = [ + build_record_batch({ + "visible" => build_boolean_array([true]), + "valid" => build_boolean_array([false]) + }), + build_record_batch({ + "visible" => build_boolean_array([false]), + "valid" => build_boolean_array([true]) + }), + ] + table = Arrow::Table.new(@schema, record_batches) + + assert_equal([ + ["visible", [true, false]], + ["valid", [false, true]], + ], + dump_table(table)) end end diff --git a/c_glib/test/test-uint16-array.rb b/c_glib/test/test-uint16-array.rb index 1362c8e7ff507..baa6934e4f4e2 100644 --- a/c_glib/test/test-uint16-array.rb +++ b/c_glib/test/test-uint16-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::UInt16ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("S*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt16ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::UInt16ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/c_glib/test/test-uint32-array.rb b/c_glib/test/test-uint32-array.rb index 01b3edb353ff2..b9efb4cf00403 100644 --- a/c_glib/test/test-uint32-array.rb +++ b/c_glib/test/test-uint32-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::UInt32ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("L*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt32ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::UInt32ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/c_glib/test/test-uint64-array.rb b/c_glib/test/test-uint64-array.rb index a002af269293c..b4275cefdd9b8 100644 --- a/c_glib/test/test-uint64-array.rb +++ b/c_glib/test/test-uint64-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::UInt64ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("Q*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt64ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::UInt64ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) 
+ builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/c_glib/test/test-uint8-array.rb b/c_glib/test/test-uint8-array.rb index 9137e53be70e5..08dfb3064cccb 100644 --- a/c_glib/test/test-uint8-array.rb +++ b/c_glib/test/test-uint8-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::UInt8ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("C*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt8ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end def test_values builder = Arrow::UInt8ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/ci/appveyor-build.bat b/ci/appveyor-build.bat index cfd451c5b896a..6e554199f08ea 100644 --- a/ci/appveyor-build.bat +++ b/ci/appveyor-build.bat @@ -22,7 +22,9 @@ if "%JOB%" == "Rust" ( ) else ( git config core.symlinks true git reset --hard - if "%JOB%"=="Cmake_Script_Tests" ( + if "%JOB:~,5%" == "MinGW" ( + call ci\appveyor-cpp-build-mingw.bat + ) else if "%JOB%" == "Cmake_Script_Tests" ( call ci\appveyor-cpp-test-cmake-script.bat ) else ( call ci\appveyor-cpp-build.bat diff --git a/ci/appveyor-cpp-build-mingw.bat b/ci/appveyor-cpp-build-mingw.bat new file mode 100644 index 0000000000000..4d3992745496a --- /dev/null +++ b/ci/appveyor-cpp-build-mingw.bat @@ -0,0 +1,61 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@echo on + +set CMAKE_BUILD_TYPE=release +set MESON_BUILD_TYPE=release + +set INSTALL_DIR=%HOMEDRIVE%%HOMEPATH%\install +set PATH=%INSTALL_DIR%\bin;%PATH% +set PKG_CONFIG_PATH=%INSTALL_DIR%\lib\pkgconfig + +set CPP_BUILD_DIR=cpp\build +mkdir %CPP_BUILD_DIR% +pushd %CPP_BUILD_DIR% + +set BOOST_ROOT=%MINGW_PREFIX% +set LZ4_HOME=%MINGW_PREFIX% +set ZSTD_HOME=%MINGW_PREFIX% +set SNAPPY_HOME=%MINGW_PREFIX% +set BROTLI_HOME=%MINGW_PREFIX% +set FLATBUFFERS_HOME=%MINGW_PREFIX% +cmake ^ + -G "MSYS Makefiles" ^ + -DCMAKE_INSTALL_PREFIX=%INSTALL_DIR% ^ + -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ + -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ + -DARROW_JEMALLOC=OFF ^ + -DARROW_USE_GLOG=OFF ^ + -DARROW_PYTHON=ON ^ + -DPythonInterp_FIND_VERSION=ON ^ + -DPythonInterp_FIND_VERSION_MAJOR=3 ^ + .. 
|| exit /B +make -j4 || exit /B +make install || exit /B +popd + +set C_GLIB_BUILD_DIR=c_glib\build +meson ^ + setup ^ + --prefix=%INSTALL_DIR% ^ + --buildtype=%MESON_BUILD_TYPE% ^ + %C_GLIB_BUILD_DIR% ^ + c_glib || exit /B +sed -i'' -s 's/\r//g' %C_GLIB_BUILD_DIR%/arrow-glib/version.h || exit /B +ninja -C %C_GLIB_BUILD_DIR% || exit /B +ninja -C %C_GLIB_BUILD_DIR% install || exit /B diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 91212a63fe3ac..78f5e419289cf 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -34,6 +34,8 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=Debug ^ -DARROW_TEST_LINKAGE=static ^ -DARROW_CXXFLAGS="/MP" ^ @@ -51,6 +53,8 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=Release ^ -DARROW_TEST_LINKAGE=static ^ -DCMAKE_CXX_FLAGS_RELEASE="/MT %CMAKE_CXX_FLAGS_RELEASE%" ^ @@ -76,6 +80,8 @@ if "%JOB%" == "Build_Debug" ( cmake -G "%GENERATOR%" ^ -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_CXXFLAGS="/MP" ^ @@ -89,11 +95,12 @@ if "%JOB%" == "Build_Debug" ( exit /B 0 ) -conda create -n arrow -q -y ^ +conda create -n arrow -q -y -c conda-forge ^ + --file=ci\conda_env_python.yml ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython ^ - thrift-cpp=0.11.0 boost-cpp ^ - -c conda-forge + numpy=1.14 ^ + thrift-cpp=0.11 ^ + boost-cpp call activate arrow @@ -103,9 +110,9 @@ set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib if "%JOB%" == "Toolchain" ( @rem Install pre-built "toolchain" packages for faster builds - conda install -q -y --file=ci\conda_env_cpp.yml ^ - python=%PYTHON% ^ - -c conda-forge + conda install -q -y -c conda-forge ^ + --file=ci\conda_env_cpp.yml ^ + python=%PYTHON% set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library ) diff --git a/ci/appveyor-cpp-setup-mingw.bat b/ci/appveyor-cpp-setup-mingw.bat new file mode 100644 index 0000000000000..471e7426f6e8f --- /dev/null +++ b/ci/appveyor-cpp-setup-mingw.bat @@ -0,0 +1,36 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. 
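Stepping back to the c_glib/test/test-table.rb changes earlier in this diff: Arrow::Table.new now accepts arrays or record batches in addition to columns, and the test grows a dump_table helper to compare all three paths. A condensed sketch of the three constructor forms; the columns, arrays, and record_batches variables are illustrative placeholders, built in the tests via the suite's Helper::Buildable methods (build_boolean_array, build_record_batch), which are test helpers, not public Arrow API:

    schema = Arrow::Schema.new([
      Arrow::Field.new("visible", Arrow::BooleanDataType.new),
      Arrow::Field.new("valid", Arrow::BooleanDataType.new),
    ])
    table1 = Arrow::Table.new(schema, columns)          # one Arrow::Column per field
    table2 = Arrow::Table.new(schema, arrays)           # one Arrow::Array per field
    table3 = Arrow::Table.new(schema, record_batches)   # batches concatenated row-wise

The appveyor-cpp-setup-mingw.bat script continuing below installs the MSYS2 packages (boost, meson, gobject-introspection, and friends) that the MinGW CMake and Meson builds above depend on.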
+ +@echo on + +set PATH=%MINGW_PREFIX%\bin;C:\msys64\usr\bin;%PATH% + +pacman -S --noconfirm ^ + "%MINGW_PACKAGE_PREFIX%-boost" ^ + "%MINGW_PACKAGE_PREFIX%-brotli" ^ + "%MINGW_PACKAGE_PREFIX%-cmake" ^ + "%MINGW_PACKAGE_PREFIX%-flatbuffers" ^ + "%MINGW_PACKAGE_PREFIX%-gcc" ^ + "%MINGW_PACKAGE_PREFIX%-gobject-introspection" ^ + "%MINGW_PACKAGE_PREFIX%-gtk-doc" ^ + "%MINGW_PACKAGE_PREFIX%-lz4" ^ + "%MINGW_PACKAGE_PREFIX%-meson" ^ + "%MINGW_PACKAGE_PREFIX%-protobuf" ^ + "%MINGW_PACKAGE_PREFIX%-python3-numpy" ^ + "%MINGW_PACKAGE_PREFIX%-snappy" ^ + "%MINGW_PACKAGE_PREFIX%-zlib" ^ + "%MINGW_PACKAGE_PREFIX%-zstd" || exit /B diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 25bf9bddbbf39..415406c4ac366 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -32,6 +32,8 @@ set FLATBUFFERS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -49,6 +51,8 @@ set GFLAGS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -66,6 +70,8 @@ set SNAPPY_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -83,6 +89,8 @@ set ZLIB_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -100,6 +108,8 @@ set BROTLI_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -117,6 +127,8 @@ set LZ4_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -134,6 +146,8 @@ set ZSTD_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -158,6 +172,8 @@ pushd %BUILD_DIR% set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 
2>output.txt diff --git a/ci/appveyor-filter-changes.bat b/ci/appveyor-filter-changes.bat index e6f008f83a299..e78f91f53150c 100644 --- a/ci/appveyor-filter-changes.bat +++ b/ci/appveyor-filter-changes.bat @@ -22,6 +22,13 @@ if "%JOB%" == "Rust" ( echo === appveyor exit ) +) else if "%JOB%" == "MinGW" ( + if "%ARROW_CI_GLIB_AFFECTED%" == "0" ( + echo === + echo === No C++, or GLib changes, exiting job + echo === + appveyor exit + ) ) else ( if "%ARROW_CI_PYTHON_AFFECTED%" == "0" ( echo === diff --git a/ci/appveyor-install.bat b/ci/appveyor-install.bat index 483f262368656..3ab8f38f68c4a 100644 --- a/ci/appveyor-install.bat +++ b/ci/appveyor-install.bat @@ -25,7 +25,11 @@ if "%JOB%" == "Rust" ( rustup install nightly rustc -Vv cargo -V +) else if "%JOB:~,5%" == "MinGW" ( + call ci\appveyor-cpp-setup-mingw.bat ) else ( set "PATH=C:\Miniconda36-x64;C:\Miniconda36-x64\Scripts;C:\Miniconda36-x64\Library\bin;%PATH%" + set BOOST_ROOT=C:\Libraries\boost_1_67_0 + set BOOST_LIBRARYDIR=C:\Libraries\boost_1_67_0\lib64-msvc-14.0 call ci\appveyor-cpp-setup.bat ) diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml index 1e22e9017fc62..87523b3fdd611 100644 --- a/ci/conda_env_cpp.yml +++ b/ci/conda_env_cpp.yml @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. -boost-cpp +# ARROW-4056: The conda-forge boost 1.69.0 seems to break the Parquet unit +# tests with Xcode 8.3. Root cause not yet determined +boost-cpp=1.68.0 brotli bzip2 cmake diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index 429851eb2f5ae..b51f5c32f3297 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -16,11 +16,10 @@ # under the License. cython -nomkl +cloudpickle +hypothesis numpy pandas pytest -python -rsync setuptools setuptools_scm diff --git a/ci/conda_env_unix.yml b/ci/conda_env_unix.yml index eeb90e48dce72..9ecf549b504eb 100644 --- a/ci/conda_env_unix.yml +++ b/ci/conda_env_unix.yml @@ -18,3 +18,4 @@ # conda package dependencies specific to Unix-like environments (Linux and macOS) autoconf +rsync diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index ef961b2e0f26e..c36c6bd5c53d9 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -44,10 +44,13 @@ mkdir cpp\build pushd cpp\build cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ + -DCMAKE_VERBOSE_MAKEFILE=OFF ^ -DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ + -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ -DCMAKE_CXX_FLAGS_RELEASE="/MD %CMAKE_CXX_FLAGS_RELEASE%" ^ -DARROW_PARQUET=ON ^ @@ -55,7 +58,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ .. || exit /B cmake --build . 
--target install --config %CONFIGURATION% || exit /B -@rem Needed so python-test.exe works +@rem Needed so arrow-python-test.exe works set OLD_PYTHONHOME=%PYTHONHOME% set PYTHONHOME=%CONDA_PREFIX% @@ -70,7 +73,7 @@ popd pushd python -pip install pickle5 +pip install -r requirements.txt pickle5 set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS% set PYARROW_CMAKE_GENERATOR=%GENERATOR% @@ -112,6 +115,6 @@ pip install %WHEEL_PATH% || exit /B python -c "import pyarrow" || exit /B python -c "import pyarrow.parquet" || exit /B -pip install pandas pickle5 pytest pytest-faulthandler || exit /B +pip install pandas pickle5 pytest pytest-faulthandler hypothesis || exit /B py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B diff --git a/ci/detect-changes.py b/ci/detect-changes.py index e9a647c5e6d9c..102dc56396c45 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -26,7 +26,7 @@ perr = functools.partial(print, file=sys.stderr) -LANGUAGE_TOPICS = ['c_glib', 'cpp', 'go', 'java', 'js', 'python', +LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', 'r', 'ruby', 'rust'] ALL_TOPICS = LANGUAGE_TOPICS + ['integration', 'site', 'dev'] diff --git a/ci/docker_build_c_glib.sh b/ci/docker_build_c_glib.sh index 28ef9011f1e23..0135781f6ccb2 100755 --- a/ci/docker_build_c_glib.sh +++ b/ci/docker_build_c_glib.sh @@ -22,7 +22,7 @@ set -e export ARROW_C_GLIB_HOME=$CONDA_PREFIX export CFLAGS="-DARROW_NO_DEPRECATED_API" -export CXXFLAGS="-DARROW_NO_DEPRECATED_API -D_GLIBCXX_USE_CXX11_ABI=0" +export CXXFLAGS="-DARROW_NO_DEPRECATED_API" mkdir -p /build/c_glib diff --git a/ci/docker_build_cpp.sh b/ci/docker_build_cpp.sh index c6a46f22f714d..450dc870249b6 100755 --- a/ci/docker_build_cpp.sh +++ b/ci/docker_build_cpp.sh @@ -17,15 +17,11 @@ # under the License. set -e -set -o xtrace source_dir=${1:-/arrow/cpp} build_dir=${2:-/build/cpp} install_dir=${3:-${ARROW_HOME:-/usr/local}} -# https://arrow.apache.org/docs/python/development.html#known-issues -export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" - mkdir -p ${build_dir} pushd ${build_dir} diff --git a/ci/docker_build_java.sh b/ci/docker_build_java.sh new file mode 100755 index 0000000000000..0cbd00f816d06 --- /dev/null +++ b/ci/docker_build_java.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e + +# /arrow/java is read-only +mkdir -p /build/java + +arrow_src=/build/java/arrow + +pushd /arrow + rsync -a header java format integration $arrow_src +popd + +pushd $arrow_src/java + mvn -DskipTests -Drat.skip=true install +popd diff --git a/ci/docker_build_python.sh b/ci/docker_build_python.sh index 8ba8a1d66f1be..36b31b99737be 100755 --- a/ci/docker_build_python.sh +++ b/ci/docker_build_python.sh @@ -21,17 +21,31 @@ set -e source_dir=${1:-/arrow/python} build_dir=${2:-/build/python} -# For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues -export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja export PYARROW_BUILD_TYPE=${PYARROW_BUILD_TYPE:-debug} + +# Feature flags +export SETUPTOOLS_SCM_VERSION_WRITE_TO_PREFIX=$build_dir +export PYARROW_WITH_ORC=${PYARROW_WITH_ORC:-1} export PYARROW_WITH_PARQUET=${PYARROW_WITH_PARQUET:-1} export PYARROW_WITH_PLASMA=${PYARROW_WITH_PLASMA:-1} # Build pyarrow pushd ${source_dir} + # hacky again, setuptools_scm writes _generated_version.py before pyarrow + # directory is created by setuptools + mkdir -p $build_dir/pyarrow -python setup.py build_ext --build-temp=${build_dir} install + relative_build_dir=$(realpath --relative-to=. $build_dir) + # this is a nightmare, but prevents mutating the source directory + # which is bind mounted as readonly + python setup.py build_ext --build-temp $relative_build_dir \ + --build-lib $relative_build_dir \ + build_py --build-lib $relative_build_dir \ + egg_info --egg-base $relative_build_dir \ + install_lib --build-dir $relative_build_dir \ + install --single-version-externally-managed \ + --record $relative_build_dir/record.txt popd diff --git a/ci/docker_build_r.sh b/ci/docker_build_r.sh index 9fb95bc88cce3..6e676784aff16 100755 --- a/ci/docker_build_r.sh +++ b/ci/docker_build_r.sh @@ -21,10 +21,6 @@ set -e export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX -# For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues -export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -export PKG_CXXFLAGS=$CXXFLAGS - # Build arrow pushd /arrow/r diff --git a/ci/docker_build_sphinx.sh b/ci/docker_build_sphinx.sh index 957804325adf1..4a65f8155fb16 100755 --- a/ci/docker_build_sphinx.sh +++ b/ci/docker_build_sphinx.sh @@ -22,9 +22,7 @@ pushd /arrow/cpp/apidoc doxygen popd -pushd /arrow/python -python setup.py build_sphinx -s ../docs/source --build-dir ../docs/_build -popd +sphinx-build -b html /arrow/docs/source /arrow/docs/_build/html mkdir -p /arrow/site/asf-site/docs/latest rsync -r /arrow/docs/_build/html/ /arrow/site/asf-site/docs/latest/ diff --git a/ci/docker_install_conda.sh b/ci/docker_install_conda.sh index 427ee76e8e256..73c7162b98926 100755 --- a/ci/docker_install_conda.sh +++ b/ci/docker_install_conda.sh @@ -27,3 +27,16 @@ rm /tmp/miniconda.sh ln -s ${CONDA_PREFIX}/etc/profile.d/conda.sh /etc/profile.d/conda.sh echo ". 
${CONDA_PREFIX}/etc/profile.d/conda.sh" >> ~/.bashrc echo "conda activate base" >> ~/.bashrc + +# Configure conda +source $MINICONDA/etc/profile.d/conda.sh +conda config --set show_channel_urls True + +# Help with SSL timeouts to S3 +conda config --set remote_connect_timeout_secs 12 + +# Setup conda-forge +conda config --add channels conda-forge + +# Update packages +conda update --all -q -y diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index c8a51fef6ec46..b36a97acf51ac 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -17,35 +17,18 @@ @rem The "main" Rust build script for Windows CI +@rem Retrieve git submodules, configure env var for Parquet unit tests +git submodule update --init || exit /B +set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data pushd rust -@echo =================================== -@echo Build with stable toolchain -@echo =================================== - -rustup default stable -rustup show -cargo build --target %TARGET% -cargo build --target %TARGET% --release -@echo Test (debug) -@echo ------------ -cargo test --target %TARGET% -@echo -@echo Test (release) -@echo -------------- -cargo test --target %TARGET% --release - @echo =================================== @echo Build with nightly toolchain @echo =================================== rustup default nightly rustup show -cargo build --target %TARGET% || exit /B cargo build --target %TARGET% --release || exit /B -@echo Test (debug) -@echo ------------ -cargo test --target %TARGET% || exit /B @echo @echo Test (release) @echo -------------- @@ -53,8 +36,10 @@ cargo test --target %TARGET% --release || exit /B @echo @echo Run example (release) @echo --------------------- +cd arrow cargo run --example builders --target %TARGET% --release || exit /B cargo run --example dynamic_types --target %TARGET% --release || exit /B cargo run --example read_csv --target %TARGET% --release || exit /B +cargo run --example read_csv_infer_schema --target %TARGET% --release || exit /B popd diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index 7cd1c2a064396..e8dd0cdc80d2e 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -44,22 +44,8 @@ gem install test-unit gobject-introspection if [ $TRAVIS_OS_NAME = "osx" ]; then sudo env PKG_CONFIG_PATH=$PKG_CONFIG_PATH luarocks install lgi else - if [ $BUILD_TORCH_EXAMPLE = "yes" ]; then - git clone \ - --quiet \ - --depth 1 \ - --recursive \ - https://github.com/torch/distro.git ~/torch - pushd ~/torch - ./install-deps > /dev/null - echo "yes" | ./install.sh > /dev/null - . 
~/torch/install/bin/torch-activate - popd - luarocks install lgi - else - sudo apt install -y -qq luarocks - sudo luarocks install lgi - fi + sudo apt install -y -qq luarocks + sudo luarocks install lgi fi pushd $ARROW_C_GLIB_DIR diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index f9e0602a80971..76ae9a66e8100 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -40,8 +40,15 @@ if [ "$only_library_mode" == "no" ]; then source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh fi +if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then + # Set up C++ toolchain from conda-forge packages for faster builds + source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh +fi + +mkdir -p $ARROW_CPP_BUILD_DIR +pushd $ARROW_CPP_BUILD_DIR + CMAKE_COMMON_FLAGS="\ --DARROW_BUILD_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" @@ -49,26 +56,34 @@ CMAKE_LINUX_FLAGS="" CMAKE_OSX_FLAGS="" if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then - # Set up C++ toolchain from conda-forge packages for faster builds - source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh CMAKE_COMMON_FLAGS="${CMAKE_COMMON_FLAGS} -DARROW_JEMALLOC=ON" CMAKE_COMMON_FLAGS="${CMAKE_COMMON_FLAGS} -DARROW_WITH_BZ2=ON" fi -mkdir -p $ARROW_CPP_BUILD_DIR -pushd $ARROW_CPP_BUILD_DIR - if [ $only_library_mode == "yes" ]; then CMAKE_COMMON_FLAGS="\ $CMAKE_COMMON_FLAGS \ --DARROW_BUILD_TESTS=OFF \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_INSTALL_NAME_RPATH=OFF" +else + CMAKE_COMMON_FLAGS="\ +$CMAKE_COMMON_FLAGS \ +-DARROW_BUILD_BENCHMARKS=ON \ +-DARROW_BUILD_TESTS=ON \ +-DARROW_BUILD_EXAMPLES=ON \ +-DARROW_BUILD_UTILITIES=ON \ +-DARROW_INSTALL_NAME_RPATH=OFF" fi +ARROW_CXXFLAGS="" + # Use Ninja for faster builds when using toolchain if [ $ARROW_TRAVIS_USE_TOOLCHAIN == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -GNinja" + if [ "$DISTRO_CODENAME" != "trusty" ]; then + # Make sure the toolchain linker (from binutils package) is picked up by clang + ARROW_CXXFLAGS="$ARROW_CXXFLAGS -B$CPP_TOOLCHAIN/bin" + fi fi if [ $ARROW_TRAVIS_PLASMA == "1" ]; then @@ -92,6 +107,9 @@ fi if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" + if [ $ARROW_TRAVIS_GANDIVA_JAVA == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_JAVA=ON" + fi fi if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then @@ -106,15 +124,24 @@ if [ $ARROW_TRAVIS_VERBOSE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_VERBOSE_THIRDPARTY_BUILD=ON" fi -if [ $ARROW_TRAVIS_USE_VENDORED_BOOST == "1" ]; then +if [ $ARROW_TRAVIS_VENDORED_BOOST == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_VENDORED=ON" fi +if [ $ARROW_TRAVIS_STATIC_BOOST == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_USE_SHARED=OFF" +fi + +if [ $ARROW_TRAVIS_OPTIONAL_INSTALL == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_OPTIONAL_INSTALL=ON" +fi + if [ $TRAVIS_OS_NAME == "linux" ]; then cmake $CMAKE_COMMON_FLAGS \ $CMAKE_LINUX_FLAGS \ -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DBUILD_WARNING_LEVEL=$ARROW_BUILD_WARNING_LEVEL \ + -DARROW_CXXFLAGS="$ARROW_CXXFLAGS" \ $ARROW_CPP_DIR else if [ "$using_homebrew" = "yes" ]; then @@ -130,8 +157,10 @@ else $ARROW_CPP_DIR fi -# Build and install libraries -$TRAVIS_MAKE -j4 +# Build and install libraries. Configure ARROW_CPP_BUILD_TARGETS environment +# variable to only build certain targets. 
If you use this, you must also set +# the environment variable ARROW_TRAVIS_OPTIONAL_INSTALL=1 +$TRAVIS_MAKE -j4 $ARROW_CPP_BUILD_TARGETS $TRAVIS_MAKE install popd diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh index f5748b2a0452a..5f70535b42c6c 100755 --- a/ci/travis_env_common.sh +++ b/ci/travis_env_common.sh @@ -33,6 +33,8 @@ export ARROW_RUBY_DIR=$TRAVIS_BUILD_DIR/ruby export ARROW_RUST_DIR=${TRAVIS_BUILD_DIR}/rust export ARROW_R_DIR=${TRAVIS_BUILD_DIR}/r +export ARROW_TRAVIS_COVERAGE=${ARROW_TRAVIS_COVERAGE:=0} + if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then export ARROW_CPP_COVERAGE_FILE=${TRAVIS_BUILD_DIR}/coverage.info export ARROW_PYTHON_COVERAGE_FILE=${TRAVIS_BUILD_DIR}/.coverage @@ -71,3 +73,18 @@ if [ $TRAVIS_OS_NAME == "osx" ]; then fi export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/cpp/submodules/parquet-testing/data + +# e.g. "trusty" or "xenial" +if [ $TRAVIS_OS_NAME == "linux" ]; then + export DISTRO_CODENAME=`lsb_release -s -c` +fi + +if [ "$ARROW_TRAVIS_USE_SYSTEM_JAVA" == "1" ]; then + # Use the Ubuntu-provided OpenJDK + unset JAVA_HOME + export TRAVIS_MVN=/usr/bin/mvn + export TRAVIS_JAVA=/usr/bin/java +else + export TRAVIS_MVN=mvn + export TRAVIS_JAVA=java +fi diff --git a/ci/travis_install_cargo.sh b/ci/travis_install_cargo.sh index f433033091ce1..e4a6b3b3493f3 100755 --- a/ci/travis_install_cargo.sh +++ b/ci/travis_install_cargo.sh @@ -21,6 +21,7 @@ set -e # ensure that both toolchains are installed rustup install stable +rustup component add rustfmt rustup install nightly pip install 'travis-cargo<0.2' --user diff --git a/ci/travis_install_clang_tools.sh b/ci/travis_install_clang_tools.sh index 49b2e47762121..9e974db5fb7cc 100755 --- a/ci/travis_install_clang_tools.sh +++ b/ci/travis_install_clang_tools.sh @@ -17,8 +17,13 @@ # specific language governing permissions and limitations # under the License. -wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add - + +set -ex + +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + +wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - sudo apt-add-repository -y \ - "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-6.0 main" + "deb https://apt.llvm.org/$DISTRO_CODENAME/ llvm-toolchain-$DISTRO_CODENAME-6.0 main" sudo apt-get update -qq sudo apt-get install -q clang-6.0 clang-format-6.0 clang-tidy-6.0 diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh index ade6392ce24a2..49a2f21ef6793 100755 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -67,7 +67,6 @@ else # Help with SSL timeouts to S3 conda config --set remote_connect_timeout_secs 12 - conda config --add channels https://repo.continuum.io/pkgs/free conda config --add channels conda-forge fi diff --git a/ci/travis_install_linux.sh b/ci/travis_install_linux.sh index 98d9bdd924bfa..b8fe63a3ff4bc 100755 --- a/ci/travis_install_linux.sh +++ b/ci/travis_install_linux.sh @@ -17,25 +17,43 @@ # specific language governing permissions and limitations # under the License. 
-sudo apt-get install -y -q \ +set -e + +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + +sudo apt-get install -y -qq \ gdb binutils ccache libboost-dev libboost-filesystem-dev \ libboost-system-dev libboost-regex-dev if [ "$CXX" == "g++-4.9" ]; then - sudo apt-get install -y -q g++-4.9 + sudo apt-get install -y -qq g++-4.9 fi if [ "$ARROW_TRAVIS_VALGRIND" == "1" ]; then - sudo apt-get install -y -q valgrind + sudo apt-get install -y -qq valgrind fi if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then - sudo apt-get install -y -q lcov + sudo apt-get install -y -qq lcov fi -if [ "$ARROW_TRAVIS_GANDIVA" == "1" -a "$ARROW_USE_TOOLCHAIN" != "1" ]; then - sudo add-apt-repository -y ppa:dluxen/cmake-backports - sudo apt-get update -q - sudo apt-get install -y -q cmake3 - sudo rm -rf /usr/local/cmake-* +set -x +if [ "$DISTRO_CODENAME" != "trusty" ]; then + if [ "$ARROW_TRAVIS_GANDIVA" == "1" ]; then + sudo apt-get install -y -qq llvm-6.0-dev + fi + + sudo apt-get install -y -qq maven + + # Remove Travis-specific versions of Java + sudo rm -rf /usr/local/lib/jvm* + sudo rm -rf /usr/local/maven* + hash -r + unset JAVA_HOME + + which java + which mvn + java -version + mvn -v fi + diff --git a/ci/travis_install_osx.sh b/ci/travis_install_osx.sh index 47d6a637f7d58..6b6a4b2533d8b 100755 --- a/ci/travis_install_osx.sh +++ b/ci/travis_install_osx.sh @@ -23,13 +23,25 @@ set -e if [ "$ARROW_CI_RUBY_AFFECTED" = "1" ]; then brew_log_path=brew.log function run_brew() { - echo brew "$@" >> ${brew_log_path} - if ! gtimeout --signal=KILL 5m brew "$@" >> ${brew_log_path} 2>&1; then - cat ${brew_log_path} - rm ${brew_log_path} - false - fi + local i=0 + local n_tries=3 + while [[ $((i++)) < ${n_tries} ]]; do + echo "${i}: brew" "$@" >> ${brew_log_path} + if gtimeout --signal=KILL 9m brew "$@" >> ${brew_log_path} 2>&1; then + break + elif [[ ${i} == ${n_tries} ]]; then + cat ${brew_log_path} + rm ${brew_log_path} + false + fi + done } + + # ARROW-3976 Old versions of git can cause failures when Homebrew prints a + # donation solicitation. Attempt to update git + git --version + run_brew upgrade git + run_brew update run_brew upgrade python run_brew uninstall postgis diff --git a/ci/travis_install_toolchain.sh b/ci/travis_install_toolchain.sh index 86ac56d043b96..7ba1f79e009b2 100755 --- a/ci/travis_install_toolchain.sh +++ b/ci/travis_install_toolchain.sh @@ -22,16 +22,32 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh if [ ! 
-e $CPP_TOOLCHAIN ]; then + CONDA_PACKAGES="" + CONDA_LABEL="" + if [ $ARROW_TRAVIS_GANDIVA == "1" ] && [ $TRAVIS_OS_NAME == "osx" ]; then - CONDA_LLVM="llvmdev=6.0.1" + CONDA_PACKAGES="$CONDA_PACKAGES llvmdev=6.0.1" + fi + + if [ $TRAVIS_OS_NAME == "linux" ]; then + if [ "$DISTRO_CODENAME" == "trusty" ]; then + CONDA_LABEL=" -c conda-forge/label/cf201901" + else + # Use newer binutils when linking against conda-provided libraries + CONDA_PACKAGES="$CONDA_PACKAGES binutils" + fi + fi + + if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then + # Use newer Valgrind + CONDA_PACKAGES="$CONDA_PACKAGES valgrind" fi # Set up C++ toolchain from conda-forge packages for faster builds - conda create -y -q -p $CPP_TOOLCHAIN \ + conda create -y -q -p $CPP_TOOLCHAIN $CONDA_LABEL \ --file=$TRAVIS_BUILD_DIR/ci/conda_env_cpp.yml \ - ${CONDA_LLVM} \ + $CONDA_PACKAGES \ ccache \ - curl \ ninja \ nomkl \ python=3.6 diff --git a/ci/travis_script_c_glib.sh b/ci/travis_script_c_glib.sh index adecc5c742967..c42a047ddf445 100755 --- a/ci/travis_script_c_glib.sh +++ b/ci/travis_script_c_glib.sh @@ -32,19 +32,10 @@ arrow_c_glib_run_test() export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$arrow_c_glib_lib_dir/pkgconfig pushd example/lua - if [ "$BUILD_TORCH_EXAMPLE" = "yes" ]; then - . ~/torch/install/bin/torch-activate - luajit write-batch.lua - luajit read-batch.lua - luajit write-stream.lua - luajit read-stream.lua - luajit stream-to-torch-tensor.lua - else - lua write-batch.lua - lua read-batch.lua - lua write-stream.lua - lua read-stream.lua - fi + lua write-batch.lua + lua read-batch.lua + lua write-stream.lua + lua read-stream.lua popd } diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index b89e5b73bf00f..14529b03160f5 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -30,7 +30,7 @@ popd # Capture C++ coverage info (we wipe the build dir in travis_script_python.sh) if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then pushd $TRAVIS_BUILD_DIR - lcov --quiet --directory . --capture --no-external --output-file $ARROW_CPP_COVERAGE_FILE \ - 2>&1 | grep -v "WARNING: no data found for /usr/include" + lcov --directory . 
--capture --no-external --output-file $ARROW_CPP_COVERAGE_FILE \ + 2>&1 | grep -v "ignoring data for external file" popd fi diff --git a/ci/travis_script_gandiva_java.sh b/ci/travis_script_gandiva_java.sh index 1f188e7e91dd4..387be9a092b98 100755 --- a/ci/travis_script_gandiva_java.sh +++ b/ci/travis_script_gandiva_java.sh @@ -24,12 +24,10 @@ JAVA_DIR=${TRAVIS_BUILD_DIR}/java pushd $JAVA_DIR -export MAVEN_OPTS="$MAVEN_OPTS -Dorg.slf4j.simpleLogger.defaultLogLevel=warn" - # build with gandiva profile -mvn -P gandiva -B install -DskipTests -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR/debug +$TRAVIS_MVN -P gandiva -B install -DskipTests -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR/debug # run gandiva tests -mvn test -P gandiva -pl gandiva -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR/debug +$TRAVIS_MVN test -P gandiva -pl gandiva -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR/debug popd diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index 286acacd74004..02e2eae81509c 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -28,7 +28,7 @@ export ARROW_CPP_EXE_PATH=$ARROW_CPP_BUILD_DIR/debug pushd $ARROW_JAVA_DIR echo "mvn package" -mvn -B clean package 2>&1 > mvn_package.log || (cat mvn_package.log && false) +$TRAVIS_MVN -B clean package 2>&1 > mvn_package.log || (cat mvn_package.log && false) popd @@ -36,14 +36,14 @@ pushd $ARROW_JS_DIR # lint and compile JS source npm run lint -npm run build +npm run build -- -t apache-arrow popd pushd $ARROW_INTEGRATION_DIR CONDA_ENV_NAME=arrow-integration-test -conda create -y -q -n $CONDA_ENV_NAME python=3.5 +conda create -y -q -n $CONDA_ENV_NAME python=3.6 conda activate $CONDA_ENV_NAME # faster builds, please @@ -52,7 +52,12 @@ conda install -y nomkl # Expensive dependencies install from Continuum package repo conda install -y pip numpy six -python integration_test.py --debug +# ARROW-4008: Create a directory to write temporary files since /tmp can be +# unstable in Travis CI +INTEGRATION_TEMPDIR=$TRAVIS_BUILD_DIR/integration_temp +mkdir -p $INTEGRATION_TEMPDIR + +python integration_test.py --debug --tempdir=$INTEGRATION_TEMPDIR popd diff --git a/ci/travis_script_java.sh b/ci/travis_script_java.sh index 8a71fdc4d0064..201c336268792 100755 --- a/ci/travis_script_java.sh +++ b/ci/travis_script_java.sh @@ -19,15 +19,16 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + JAVA_DIR=${TRAVIS_BUILD_DIR}/java pushd $JAVA_DIR -export MAVEN_OPTS="$MAVEN_OPTS -Dorg.slf4j.simpleLogger.defaultLogLevel=warn" -if [ $ARROW_TRAVIS_JAVA_BUILD_ONLY == "1" ]; then +if [ "$ARROW_TRAVIS_JAVA_BUILD_ONLY" == "1" ]; then # Save time and make build less verbose by skipping tests and style checks - mvn -DskipTests=true -Dcheckstyle.skip=true -B install + $TRAVIS_MVN -DskipTests=true -Dcheckstyle.skip=true -B install else - mvn -B install + $TRAVIS_MVN -B install fi popd diff --git a/ci/travis_script_javadoc.sh b/ci/travis_script_javadoc.sh index ccfb2dab61d05..755d4628f205b 100755 --- a/ci/travis_script_javadoc.sh +++ b/ci/travis_script_javadoc.sh @@ -19,11 +19,13 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + JAVA_DIR=${TRAVIS_BUILD_DIR}/java pushd $JAVA_DIR export MAVEN_OPTS="$MAVEN_OPTS -Dorg.slf4j.simpleLogger.defaultLogLevel=warn" -mvn -B site +$TRAVIS_MVN -B site popd diff --git a/ci/travis_script_js.sh b/ci/travis_script_js.sh index 1871b4265cd01..34b07115e70b1 100755 --- a/ci/travis_script_js.sh +++ b/ci/travis_script_js.sh @@ -23,9 +23,10 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh pushd $ARROW_JS_DIR -npm 
run lint +npm run lint:ci npm run build -# run the non-snapshot unit tests npm test +npm run test:coverage +bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" popd diff --git a/ci/travis_script_plasma_java_client.sh b/ci/travis_script_plasma_java_client.sh index 927a2391201c1..0b291ed32a56d 100755 --- a/ci/travis_script_plasma_java_client.sh +++ b/ci/travis_script_plasma_java_client.sh @@ -23,16 +23,15 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh PLASMA_JAVA_DIR=${TRAVIS_BUILD_DIR}/java/plasma - pushd $PLASMA_JAVA_DIR -mvn clean install +$TRAVIS_MVN clean install export LD_LIBRARY_PATH=${ARROW_CPP_INSTALL}/lib:$LD_LIBRARY_PATH export PLASMA_STORE=${ARROW_CPP_INSTALL}/bin/plasma_store_server ldd $PLASMA_STORE -java -cp target/test-classes:target/classes -Djava.library.path=${TRAVIS_BUILD_DIR}/cpp-build/debug/ org.apache.arrow.plasma.PlasmaClientTest +$TRAVIS_JAVA -cp target/test-classes:target/classes -Djava.library.path=${TRAVIS_BUILD_DIR}/cpp-build/debug/ org.apache.arrow.plasma.PlasmaClientTest popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index e4290ed8ee026..60335d966846b 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -32,36 +32,36 @@ PYARROW_PYTEST_FLAGS=" -r sxX --durations=15 --parquet" PYTHON_VERSION=$1 CONDA_ENV_DIR=$TRAVIS_BUILD_DIR/pyarrow-test-$PYTHON_VERSION -conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl -conda activate $CONDA_ENV_DIR - # We should use zlib in the target Python directory to avoid loading # wrong libpython on macOS at run-time. If we use zlib in # $ARROW_BUILD_TOOLCHAIN and libpython3.6m.dylib exists in both -# $ARROW_BUILD_TOOLCHAIN and $CONDA_ENV_DIR, python-test uses +# $ARROW_BUILD_TOOLCHAIN and $CONDA_ENV_DIR, arrow-python-test uses # libpython3.6m.dylib on $ARROW_BUILD_TOOLCHAIN not $CONDA_ENV_DIR. # libpython3.6m.dylib on $ARROW_BUILD_TOOLCHAIN doesn't have NumPy. So # python-test fails. export ZLIB_HOME=$CONDA_ENV_DIR -python --version -which python - if [ $ARROW_TRAVIS_PYTHON_JVM == "1" ]; then CONDA_JVM_DEPS="jpype1" fi -conda install -y -q pip \ +conda create -y -q -p $CONDA_ENV_DIR \ + --file $TRAVIS_BUILD_DIR/ci/conda_env_python.yml \ nomkl \ - cloudpickle \ - numpy=1.13.1 \ - ${CONDA_JVM_DEPS} \ - pandas \ - cython + cmake \ + pip \ + numpy=1.14 \ + python=${PYTHON_VERSION} \ + ${CONDA_JVM_DEPS} + +conda activate $CONDA_ENV_DIR + +python --version +which python if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then # Install documentation dependencies - conda install -y -c conda-forge --file ci/conda_env_sphinx.yml + conda install -y --file ci/conda_env_sphinx.yml fi # ARROW-2093: PyTorch increases the size of our conda dependency stack @@ -74,7 +74,7 @@ fi # fi if [ $TRAVIS_OS_NAME != "osx" ]; then - conda install -y -c conda-forge tensorflow + conda install -y tensorflow PYARROW_PYTEST_FLAGS="$PYARROW_PYTEST_FLAGS --tensorflow" fi @@ -88,19 +88,23 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? 
CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" +PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma parquet" + if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" fi if [ $ARROW_TRAVIS_PYTHON_GANDIVA == "1" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=OFF" + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" + PYTHON_CPP_BUILD_TARGETS="$PYTHON_CPP_BUILD_TARGETS gandiva" fi cmake -GNinja \ $CMAKE_COMMON_FLAGS \ - -DARROW_BUILD_TESTS=on \ - -DARROW_TEST_INCLUDE_LABELS=python \ - -DARROW_BUILD_UTILITIES=off \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_OPTIONAL_INSTALL=ON \ + -DARROW_PARQUET=on \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ @@ -109,19 +113,16 @@ cmake -GNinja \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ $ARROW_CPP_DIR -ninja +ninja $PYTHON_CPP_BUILD_TARGETS ninja install popd # python-test isn't run by travis_script_cpp.sh, exercise it here -$ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/python-test +$ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/arrow-python-test pushd $ARROW_PYTHON_DIR -# Other stuff pip install -pip install -q -r requirements.txt - if [ "$PYTHON_VERSION" == "3.6" ]; then pip install -q pickle5 fi @@ -130,6 +131,9 @@ if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then pip install -q coverage fi +echo "=== pip list ===" +pip list + export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$ARROW_CPP_INSTALL/lib/pkgconfig export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE @@ -174,12 +178,11 @@ if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then coverage report -i --include="*/_parquet.pyx" # Generate XML file for CodeCov coverage xml -i -o $TRAVIS_BUILD_DIR/coverage.xml - # Capture C++ coverage info and combine with previous coverage file + # Capture C++ coverage info pushd $TRAVIS_BUILD_DIR - lcov --quiet --directory . --capture --no-external --output-file coverage-python-tests.info \ - 2>&1 | grep -v "WARNING: no data found for /usr/include" + lcov --directory . --capture --no-external --output-file coverage-python-tests.info \ + 2>&1 | grep -v "ignoring data for external file" lcov --add-tracefile coverage-python-tests.info \ - --add-tracefile $ARROW_CPP_COVERAGE_FILE \ --output-file $ARROW_CPP_COVERAGE_FILE rm coverage-python-tests.info popd # $TRAVIS_BUILD_DIR diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 02a32cdabe818..c25d64ec42cb6 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -19,6 +19,8 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + RUST_DIR=${TRAVIS_BUILD_DIR}/rust pushd $RUST_DIR @@ -26,22 +28,17 @@ pushd $RUST_DIR # show activated toolchain rustup show -# check code formatting only for Rust nightly -if [ $RUSTUP_TOOLCHAIN == "nightly" ] -then - # raises on any formatting errors - rustup component add rustfmt-preview - cargo fmt --all -- --check -fi - -# raises on any warnings -cargo rustc -- -D warnings +# raises on any formatting errors +cargo +stable fmt --all -- --check -cargo build +RUSTFLAGS="-D warnings" cargo build cargo test -cargo bench + +# run examples +cd arrow cargo run --example builders cargo run --example dynamic_types cargo run --example read_csv +cargo run --example read_csv_infer_schema popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6deb339f4c2f0..3ec430e81a40b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,13 +16,10 @@ # under the License. 
cmake_minimum_required(VERSION 3.2) +message(STATUS "Building using CMake version: ${CMAKE_VERSION}") + +set(ARROW_VERSION "0.13.0-SNAPSHOT") -# Extract Arrow version number -file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../java/pom.xml" POM_XML) -string(REGEX MATCHALL - "\n [^<]+" ARROW_VERSION_TAG "${POM_XML}") -string(REGEX REPLACE - "(\n |)" "" ARROW_VERSION "${ARROW_VERSION_TAG}") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -50,6 +47,8 @@ message(STATUS "Arrow version: " "${ARROW_VERSION_MAJOR}.${ARROW_VERSION_MINOR}.${ARROW_VERSION_PATCH} " "(full: '${ARROW_VERSION}')") +set(ARROW_SOURCE_DIR ${PROJECT_SOURCE_DIR}) +set(ARROW_BINARY_DIR ${PROJECT_BINARY_DIR}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") @@ -65,6 +64,12 @@ if(POLICY CMP0054) cmake_policy(SET CMP0054 NEW) endif() +# don't ignore _ROOT variables in find_package +if(POLICY CMP0074) + # https://cmake.org/cmake/help/v3.12/policy/CMP0074.html + cmake_policy(SET CMP0074 NEW) +endif() + set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(CLANG_FORMAT_VERSION "6.0") @@ -84,12 +89,6 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR INFER_FOUND) set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -find_program(CCACHE_FOUND ccache) -if(CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) -endif(CCACHE_FOUND) - # ---------------------------------------------------------------------- # cmake options @@ -114,38 +113,94 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Run the test suite using valgrind --tool=memcheck" OFF) - option(ARROW_BUILD_TESTS - "Build the Arrow googletest unit tests" + option(ARROW_USE_ASAN + "Enable Address Sanitizer checks" + OFF) + + option(ARROW_USE_CCACHE + "Use ccache when compiling (if available)" ON) - set(ARROW_TEST_LINKAGE "shared" CACHE STRING - "Linkage of Arrow libraries with unit tests executables. \ -static|shared (default shared)") + option(ARROW_USE_TSAN + "Enable Thread Sanitizer checks" + OFF) - set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING - "Only build unit tests having the indicated label or labels. \ -Pass multiple labels by dividing with semicolons") + option(ARROW_BUILD_TESTS + "Build the Arrow googletest unit tests, default OFF" + OFF) option(ARROW_BUILD_BENCHMARKS - "Build the Arrow micro benchmarks" + "Build the Arrow micro benchmarks, default OFF" OFF) + option(ARROW_BUILD_EXAMPLES + "Build the Arrow examples, default OFF" + OFF) + + set(ARROW_TEST_LINKAGE "shared" CACHE STRING + "Linkage of Arrow libraries with unit tests executables. 
\ +static|shared (default shared)") + option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) - option(ARROW_COMPUTE - "Build the Arrow Compute Modules" + option(ARROW_FUZZING + "Build Arrow Fuzzing executables" + OFF) + + # Disable this option to exercise non-SIMD fallbacks + option(ARROW_USE_SIMD + "Build with SIMD optimizations" ON) - option(ARROW_EXTRA_ERROR_CONTEXT - "Compile with extra error context (line numbers, code)" + option(ARROW_ALTIVEC + "Build Arrow with Altivec" + ON) + + option(ARROW_BUILD_UTILITIES + "Build Arrow commandline utilities" + ON) + + option(ARROW_RPATH_ORIGIN + "Build Arrow libraries with RATH set to \$ORIGIN" + OFF) + + option(ARROW_INSTALL_NAME_RPATH + "Build Arrow libraries with install_name set to @rpath" + ON) + + option(ARROW_GENERATE_COVERAGE + "Build with C++ code coverage enabled" OFF) + option(ARROW_VERBOSE_LINT + "If off, 'quiet' flags will be passed to linting tools" + OFF) + + option(ARROW_GGDB_DEBUG + "Pass -ggdb flag to debug builds" + ON) + + #---------------------------------------------------------------------- + # Project components to enable / disable building + + option(ARROW_COMPUTE + "Build the Arrow Compute Modules" + ON) + option(ARROW_FLIGHT "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" OFF) + option(ARROW_GANDIVA + "Build the Gandiva libraries" + OFF) + + option(ARROW_PARQUET + "Build the Parquet libraries" + OFF) + option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -170,58 +225,49 @@ Pass multiple labels by dividing with semicolons") "Build the Arrow HDFS bridge" ON) - option(ARROW_BOOST_USE_SHARED - "Rely on boost shared libraries where relevant" - ON) - - option(ARROW_BOOST_VENDORED - "Use vendored Boost instead of existing Boost" - OFF) - - option(ARROW_PROTOBUF_USE_SHARED - "Rely on Protocol Buffers shared libraries where relevant" - OFF) - option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) - option(ARROW_FUZZING - "Build Arrow Fuzzing executables" + option(ARROW_HIVESERVER2 + "Build the HiveServer2 client and Arrow adapter" OFF) - # Disable this option to exercise non-SIMD fallbacks - option(ARROW_USE_SIMD - "Build with SIMD optimizations" - ON) + option(ARROW_PLASMA + "Build the plasma object store along with Arrow" + OFF) - option(ARROW_ALTIVEC - "Build Arrow with Altivec" - ON) + option(ARROW_PLASMA_JAVA_CLIENT + "Build the plasma object store java client" + OFF) - option(ARROW_BUILD_UTILITIES - "Build Arrow commandline utilities" - ON) + #---------------------------------------------------------------------- + # Thirdparty toolchain options - option(ARROW_RPATH_ORIGIN - "Build Arrow libraries with RATH set to \$ORIGIN" + option(ARROW_VERBOSE_THIRDPARTY_BUILD + "If off, output from ExternalProjects will be logged to files rather than shown" OFF) - option(ARROW_INSTALL_NAME_RPATH - "Build Arrow libraries with install_name set to @rpath" + option(ARROW_BOOST_USE_SHARED + "Rely on boost shared libraries where relevant" ON) - option(ARROW_HIVESERVER2 - "Build the HiveServer2 client and Arrow adapter" + option(ARROW_BOOST_VENDORED + "Use vendored Boost instead of existing Boost. 
\ +Note that this requires linking Boost statically" OFF) - option(ARROW_PLASMA - "Build the plasma object store along with Arrow" + option(ARROW_PROTOBUF_USE_SHARED + "Rely on Protocol Buffers shared libraries where relevant" OFF) - option(ARROW_PLASMA_JAVA_CLIENT - "Build the plasma object store java client" - OFF) + option(ARROW_WITH_BACKTRACE + "Build with backtrace support" + ON) + + option(ARROW_USE_GLOG + "Build libraries with glog support for pluggable logging" + ON) option(ARROW_WITH_BROTLI "Build with Brotli compression" @@ -253,21 +299,8 @@ Pass multiple labels by dividing with semicolons") "Build with zstd compression" ${ARROW_WITH_ZSTD_DEFAULT}) - option(ARROW_GENERATE_COVERAGE - "Build with C++ code coverage enabled" - OFF) - - option(ARROW_VERBOSE_THIRDPARTY_BUILD - "If off, output from ExternalProjects will be logged to files rather than shown" - OFF) - - option(ARROW_VERBOSE_LINT - "If off, 'quiet' flags will be passed to linting tools" - OFF) - - option(ARROW_USE_GLOG - "Build libraries with glog support for pluggable logging" - ON) + #---------------------------------------------------------------------- + # Windows options if (MSVC) option(ARROW_USE_CLCACHE @@ -288,10 +321,8 @@ Pass multiple labels by dividing with semicolons") OFF) endif() - # Parquet-related build options - option(ARROW_PARQUET - "Build the Parquet libraries" - OFF) + #---------------------------------------------------------------------- + # Parquet build options option(PARQUET_MINIMAL_DEPENDENCY "Depend only on Thirdparty headers to build libparquet. \ @@ -306,9 +337,11 @@ Always OFF if building binaries" "Build the Parquet examples. Requires static libraries to be built." OFF) - # Gandiva related build options - option(ARROW_GANDIVA - "Build the Gandiva libraries" + #---------------------------------------------------------------------- + # Gandiva build options + + option(ARROW_GANDIVA_JAVA + "Build the Gandiva JNI wrappers" OFF) # ARROW-3860: Temporary workaround @@ -316,16 +349,41 @@ Always OFF if building binaries" "Include -static-libstdc++ -static-libgcc when linking with Gandiva static libraries" OFF) - option(ARROW_GANDIVA_JAVA - "Build the Gandiva JNI wrappers" - ON) + set(ARROW_GANDIVA_PC_CXX_FLAGS "" CACHE STRING + "Compiler flags to append when pre-compiling Gandiva operations") - option(ARROW_GANDIVA_BUILD_TESTS - "Build the Gandiva googletest unit tests" - ON) + #---------------------------------------------------------------------- + # Advanced developer options + option(ARROW_EXTRA_ERROR_CONTEXT + "Compile with extra error context (line numbers, code)" + OFF) + + option(ARROW_OPTIONAL_INSTALL + "If enabled install ONLY targets that have already been built. Please be \ +advised that if this is enabled 'install' will fail silently on components \ +that have not been built" + OFF) endif() +# Needed for linting targets, etc. 
+find_package(PythonInterp) + +if (ARROW_USE_CCACHE) + find_program(CCACHE_FOUND ccache) + if(CCACHE_FOUND) + message(STATUS "Using ccache: ${CCACHE_FOUND}") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) + endif(CCACHE_FOUND) +endif() + +if (ARROW_OPTIONAL_INSTALL) + # Don't make the "install" target depend on the "all" target + set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) + + set(INSTALL_IS_OPTIONAL OPTIONAL) +endif() ############################################################ # "make lint" target @@ -334,69 +392,68 @@ if (NOT ARROW_VERBOSE_LINT) set(ARROW_LINT_QUIET "--quiet") endif() -if (UNIX) +if (NOT LINT_EXCLUSIONS_FILE) + # source files matching a glob from a line in this file + # will be excluded from linting (cpplint, clang-tidy, clang-format) + set(LINT_EXCLUSIONS_FILE ${BUILD_SUPPORT_DIR}/lint_exclusions.txt) +endif() - file(GLOB_RECURSE LINT_FILES - "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h" - "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc" - ) - - FOREACH(item ${LINT_FILES}) - IF(NOT ((item MATCHES "_generated.h") OR - (item MATCHES "pyarrow_api.h") OR - (item MATCHES "pyarrow_lib.h") OR - (item MATCHES "xxhash.h") OR - (item MATCHES "xxhash.cc") OR - (item MATCHES "config.h") OR - (item MATCHES "util/date.h") OR - (item MATCHES "util/string_view/") OR - (item MATCHES "util/variant") OR - (item MATCHES "zmalloc.h") OR - (item MATCHES "ae.h"))) - LIST(APPEND FILTERED_LINT_FILES ${item}) - ENDIF() - ENDFOREACH(item ${LINT_FILES}) - - find_program(CPPLINT_BIN NAMES cpplint cpplint.py HINTS ${BUILD_SUPPORT_DIR}) - message(STATUS "Found cpplint executable at ${CPPLINT_BIN}") - - # Full lint - # Balancing act: cpplint.py takes a non-trivial time to launch, - # so process 12 files per invocation, while still ensuring parallelism - add_custom_target(lint echo ${FILTERED_LINT_FILES} | xargs -n12 -P8 - ${CPPLINT_BIN} - --verbose=2 ${ARROW_LINT_QUIET} - --linelength=90 - --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references,-build/include_order - ) -endif (UNIX) +find_program(CPPLINT_BIN NAMES cpplint cpplint.py HINTS ${BUILD_SUPPORT_DIR}) +message(STATUS "Found cpplint executable at ${CPPLINT_BIN}") + +add_custom_target(lint + ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/run_cpplint.py + --cpplint_binary ${CPPLINT_BIN} + --exclude_globs ${LINT_EXCLUSIONS_FILE} + --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/src + ${ARROW_LINT_QUIET}) ############################################################ # "make format" and "make check-format" targets ############################################################ - -# runs clang format and updates files in place. -add_custom_target(format ${BUILD_SUPPORT_DIR}/run_clang_format.py - ${CLANG_FORMAT_BIN} - ${BUILD_SUPPORT_DIR}/clang_format_exclusions.txt - ${CMAKE_CURRENT_SOURCE_DIR}/src --fix ${ARROW_LINT_QUIET}) - -# runs clang format and exits with a non-zero exit code if any files need to be reformatted -add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run_clang_format.py - ${CLANG_FORMAT_BIN} - ${BUILD_SUPPORT_DIR}/clang_format_exclusions.txt - ${CMAKE_CURRENT_SOURCE_DIR}/src ${ARROW_LINT_QUIET}) +if (${CLANG_FORMAT_FOUND}) + # runs clang format and updates files in place. 
+ add_custom_target(format + ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/run_clang_format.py + --clang_format_binary ${CLANG_FORMAT_BIN} + --exclude_globs ${LINT_EXCLUSIONS_FILE} + --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/src + --fix + ${ARROW_LINT_QUIET}) + + # runs clang format and exits with a non-zero exit code if any files need to be reformatted + add_custom_target(check-format + ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/run_clang_format.py + --clang_format_binary ${CLANG_FORMAT_BIN} + --exclude_globs ${LINT_EXCLUSIONS_FILE} + --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/src + ${ARROW_LINT_QUIET}) +endif() ############################################################ # "make clang-tidy" and "make check-clang-tidy" targets ############################################################ if (${CLANG_TIDY_FOUND}) + # TODO check to make sure .clang-tidy is being respected + # runs clang-tidy and attempts to fix any warning automatically - add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) + add_custom_target(clang-tidy + ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/run_clang_tidy.py + --clang_tidy_binary ${CLANG_TIDY_BIN} + --exclude_globs ${LINT_EXCLUSIONS_FILE} + --compile_commands ${CMAKE_BINARY_DIR}/compile_commands.json + --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/src + --fix + ${ARROW_LINT_QUIET}) + # runs clang-tidy and exits with a non-zero exit code if any errors are found. - add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json - 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore | sed -e '/_generated/g'`) + add_custom_target(check-clang-tidy + ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/run_clang_tidy.py + --clang_tidy_binary ${CLANG_TIDY_BIN} + --exclude_globs ${LINT_EXCLUSIONS_FILE} + --compile_commands ${CMAKE_BINARY_DIR}/compile_commands.json + --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/src + ${ARROW_LINT_QUIET}) endif() if (ARROW_ONLY_LINT) @@ -409,11 +466,17 @@ endif() ############################################################ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) + # Currently the compression tests require at least these libraries; bz2 and + # zstd are optional. 
See ARROW-3984 set(ARROW_WITH_BROTLI ON) set(ARROW_WITH_LZ4 ON) set(ARROW_WITH_SNAPPY ON) set(ARROW_WITH_ZLIB ON) - set(ARROW_WITH_ZSTD ON) +endif() + +if(ARROW_BUILD_TESTS) + # JSON parsing of arrays is required for Arrow unit tests + set(ARROW_IPC ON) endif() if(PARQUET_BUILD_EXAMPLES OR PARQUET_BUILD_EXECUTABLES) @@ -436,20 +499,26 @@ endif() if(NOT ARROW_BUILD_TESTS) set(NO_TESTS 1) +else() + add_custom_target(all-tests) + add_custom_target(unittest ctest -L unittest) + add_dependencies(unittest all-tests) endif() if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) +else() + add_custom_target(all-benchmarks) + add_custom_target(benchmark ctest -L benchmark) + add_dependencies(benchmark all-benchmarks) endif() -if (NOT ARROW_FUZZING) - set(NO_FUZZING 1) +if(NOT ARROW_BUILD_EXAMPLES) + set(NO_EXAMPLES 1) endif() -if (ARROW_TENSORFLOW) - # TensorFlow uses the old GLIBCXX ABI, so we have to use it too - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +if (NOT ARROW_FUZZING) + set(NO_FUZZING 1) endif() if (MSVC AND ARROW_USE_CLCACHE AND @@ -482,8 +551,6 @@ include(SetupCxxFlags) # Dependencies ############################################################ -add_custom_target(arrow_dependencies) - include(BuildUtils) enable_testing() @@ -635,48 +702,65 @@ endif(UNIX) ############################################################ set(ARROW_LINK_LIBS) +set(ARROW_SHARED_INSTALL_INTERFACE_LIBS) +set(ARROW_STATIC_INSTALL_INTERFACE_LIBS) # Libraries to link statically with libarrow.so set(ARROW_STATIC_LINK_LIBS double-conversion_static) +set(ARROW_STATIC_INSTALL_INTERFACE_LIBS double-conversion) if (ARROW_WITH_BROTLI) - SET(ARROW_STATIC_LINK_LIBS + list(APPEND + ARROW_STATIC_LINK_LIBS brotli_dec_static brotli_enc_static - brotli_common_static - ${ARROW_STATIC_LINK_LIBS}) + brotli_common_static) + list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + brotlidec + brotlienc + brotlicommon) endif() if (ARROW_WITH_BZ2) - SET(ARROW_STATIC_LINK_LIBS bz2_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS bz2_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS bz2) endif() if (ARROW_WITH_LZ4) - SET(ARROW_STATIC_LINK_LIBS lz4_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS lz4_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4) endif() if (ARROW_WITH_SNAPPY) - SET(ARROW_STATIC_LINK_LIBS snappy_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS snappy_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS snappy) endif() if (ARROW_WITH_ZLIB) - SET(ARROW_STATIC_LINK_LIBS ${ZLIB_LIBRARY} ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ZLIB_LIBRARY}) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS z) endif() if (ARROW_WITH_ZSTD) - SET(ARROW_STATIC_LINK_LIBS zstd_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS zstd_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS zstd) endif() if (ARROW_ORC) - SET(ARROW_STATIC_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} - orc_static) + list(APPEND ARROW_STATIC_LINK_LIBS orc_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc) endif() if (ARROW_USE_GLOG) - SET(ARROW_STATIC_LINK_LIBS glog_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS glog_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS glog) + add_definitions("-DARROW_USE_GLOG") endif() +add_custom_target(arrow_dependencies) +add_dependencies(arrow_dependencies toolchain) + if 
(ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) endif() @@ -687,15 +771,24 @@ set(ARROW_SHARED_PRIVATE_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY}) -set(ARROW_STATIC_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} +list(APPEND + ARROW_STATIC_LINK_LIBS ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY}) +list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + boost_system + boost_filesystem + boost_regex) + if (NOT MSVC) - set(ARROW_LINK_LIBS - ${ARROW_LINK_LIBS} + list(APPEND + ARROW_LINK_LIBS + ${CMAKE_DL_LIBS}) + list(APPEND + ARROW_SHARED_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) endif() @@ -703,8 +796,8 @@ set(ARROW_TEST_STATIC_LINK_LIBS arrow_testing_static arrow_static ${ARROW_LINK_LIBS} - gtest_main_static - gtest_static) + ${GTEST_MAIN_LIBRARY} + ${GTEST_LIBRARY}) set(ARROW_TEST_SHARED_LINK_LIBS arrow_testing_shared @@ -714,8 +807,8 @@ set(ARROW_TEST_SHARED_LINK_LIBS ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY} - gtest_main_static - gtest_static) + ${GTEST_MAIN_LIBRARY} + ${GTEST_LIBRARY}) if(NOT MSVC) set(ARROW_TEST_SHARED_LINK_LIBS @@ -723,19 +816,21 @@ if(NOT MSVC) ${CMAKE_DL_LIBS}) endif() -if ("${ARROW_TEST_LINKAGE}" STREQUAL "shared") +if (ARROW_BUILD_TESTS AND "${ARROW_TEST_LINKAGE}" STREQUAL "shared") if (NOT ARROW_BUILD_SHARED) message(FATAL_ERROR "If using shared linkage for unit tests, must also \ pass ARROW_BUILD_SHARED=on") endif() # Use shared linking for unit tests if it's available set(ARROW_TEST_LINK_LIBS ${ARROW_TEST_SHARED_LINK_LIBS}) + set(ARROW_EXAMPLE_LINK_LIBS arrow_shared) else() if (NOT ARROW_BUILD_STATIC) message(FATAL_ERROR "If using static linkage for unit tests, must also \ pass ARROW_BUILD_STATIC=on") endif() set(ARROW_TEST_LINK_LIBS ${ARROW_TEST_STATIC_LINK_LIBS}) + set(ARROW_EXAMPLE_LINK_LIBS arrow_static) endif() if (ARROW_BUILD_BENCHMARKS) @@ -744,40 +839,28 @@ if (ARROW_BUILD_BENCHMARKS) ${ARROW_TEST_LINK_LIBS}) endif() +set(ARROW_SYSTEM_LINK_LIBS) + if (ARROW_JEMALLOC) add_definitions(-DARROW_JEMALLOC) add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR}) - - if (NOT WIN32 AND NOT APPLE) - set(ARROW_JEMALLOC_LINK_LIBS - jemalloc_static - # For glibc <2.17 we need to link to librt. - # As we compile with --as-needed by default, the linker will omit this - # dependency if not required. 
-      rt
-    )
-  else()
-    set(ARROW_JEMALLOC_LINK_LIBS
-      jemalloc_static
-    )
-  endif()
-  set(ARROW_SHARED_PRIVATE_LINK_LIBS
-    ${ARROW_SHARED_PRIVATE_LINK_LIBS}
-    ${ARROW_JEMALLOC_LINK_LIBS})
-  set(ARROW_STATIC_LINK_LIBS
-    ${ARROW_STATIC_LINK_LIBS}
-    ${ARROW_JEMALLOC_LINK_LIBS})
+  list(APPEND ARROW_SYSTEM_LINK_LIBS jemalloc_static)
 endif(ARROW_JEMALLOC)

-if (PTHREAD_LIBRARY)
-  set(ARROW_LINK_LIBS
-    ${ARROW_LINK_LIBS}
-    pthreadshared)
-  set(ARROW_STATIC_LINK_LIBS
-    ${ARROW_STATIC_LINK_LIBS}
-    pthreadshared)
+if (THREADS_FOUND)
+  list(APPEND ARROW_SYSTEM_LINK_LIBS Threads::Threads)
+endif()
+
+if (NOT WIN32 AND NOT APPLE)
+  # Pass -lrt on Linux only
+  list(APPEND ARROW_SYSTEM_LINK_LIBS rt)
 endif()

+list(APPEND ARROW_LINK_LIBS ${ARROW_SYSTEM_LINK_LIBS})
+list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_SYSTEM_LINK_LIBS})
+list(APPEND ARROW_SHARED_INSTALL_INTERFACE_LIBS ${ARROW_SYSTEM_LINK_LIBS})
+list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_SYSTEM_LINK_LIBS})
+
 ############################################################
 # Subdirectories
 ############################################################
@@ -788,18 +871,6 @@ endif()

 add_subdirectory(src/arrow)

-if(ARROW_FLIGHT)
-  add_subdirectory(src/arrow/flight)
-endif()
-
-if(ARROW_PYTHON)
-  add_subdirectory(src/arrow/python)
-endif()
-
-if(ARROW_HIVESERVER2)
-  add_subdirectory(src/arrow/dbi/hiveserver2)
-endif()
-
 if(ARROW_PARQUET)
   add_subdirectory(src/parquet)
   add_subdirectory(tools/parquet)
@@ -812,6 +883,11 @@ if(ARROW_GANDIVA)
   add_subdirectory(src/gandiva)
 endif()

+if(ARROW_BUILD_EXAMPLES)
+  add_custom_target(runexample ctest -L example)
+  add_subdirectory(examples/arrow)
+endif()
+
 include(CMakePackageConfigHelpers)

 # Makes the project importable from the build directory
diff --git a/cpp/Dockerfile b/cpp/Dockerfile
index c4791019634c1..17d332d22bed3 100644
--- a/cpp/Dockerfile
+++ b/cpp/Dockerfile
@@ -18,7 +18,8 @@ FROM ubuntu:18.04

 # install build essentials
-RUN apt-get update -y -q && \
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get update -y -q && \
     apt-get install -y -q --no-install-recommends \
       ca-certificates \
       ccache \
@@ -27,9 +28,11 @@ RUN apt-get update -y -q && \
       git \
       ninja-build \
       pkg-config \
+      tzdata \
       wget

 # install conda and required packages
+ARG EXTRA_CONDA_PKGS
 ENV PATH=/opt/conda/bin:$PATH \
     CONDA_PREFIX=/opt/conda
 ADD ci/docker_install_conda.sh \
@@ -37,9 +40,10 @@ ADD ci/docker_install_conda.sh \
     ci/conda_env_unix.yml \
     /arrow/ci/
 RUN arrow/ci/docker_install_conda.sh && \
-    conda install -c conda-forge \
+    conda install -q -c conda-forge \
     --file arrow/ci/conda_env_cpp.yml \
-    --file arrow/ci/conda_env_unix.yml && \
+    --file arrow/ci/conda_env_unix.yml \
+    $EXTRA_CONDA_PKGS && \
     conda clean --all

 ENV CC=gcc \
diff --git a/cpp/README.md b/cpp/README.md
index 394b23d69f8fc..7312a31f23779 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -30,16 +30,41 @@ in-source and out-of-source builds with the latter one being preferred.

 Building Arrow requires:

 * A C++11-enabled compiler. On Linux, gcc 4.8 and higher should be sufficient.
-* CMake
+* CMake 3.2 or higher
 * Boost
+* Bison/flex (only required when building Apache Thrift from source,
+  a Parquet dependency)
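Before configuring, you can quickly confirm that the requirements above are present. This is an illustrative sanity check, not part of the build itself:

```shell
cmake --version    # expect 3.2 or higher
g++ --version      # gcc 4.8 or higher is sufficient on Linux
bison --version    # only needed if Apache Thrift is built from source
flex --version     # likewise, only for a Thrift source build
```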
+ +Testing arrow with ctest requires: + +* python On Ubuntu/Debian you can install the requirements with: ```shell -sudo apt-get install cmake \ +sudo apt-get install \ + autoconf \ + build-essential \ + cmake \ libboost-dev \ libboost-filesystem-dev \ - libboost-system-dev + libboost-regex-dev \ + libboost-system-dev \ + python \ + bison \ + flex +``` + +On Alpine Linux: + +```shell +apk add autoconf \ + bash \ + boost-dev \ + cmake \ + g++ \ + gcc \ + make ``` On macOS, you can use [Homebrew][1]: @@ -54,25 +79,29 @@ If you are developing on Windows, see the [Windows developer guide][2]. ## Building Arrow -Simple debug build: +Simple release build: git clone https://github.com/apache/arrow.git cd arrow/cpp - mkdir debug - cd debug - cmake .. + mkdir release + cd release + cmake -DARROW_BUILD_TESTS=ON .. make unittest -Simple release build: +Simple debug build: git clone https://github.com/apache/arrow.git cd arrow/cpp - mkdir release - cd release - cmake .. -DCMAKE_BUILD_TYPE=Release + mkdir debug + cd debug + cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON .. make unittest -Detailed unit test logs will be placed in the build directory under `build/test-logs`. +If you do not need to build the test suite, you can omit the +`ARROW_BUILD_TESTS` option (the default is not to build the unit tests). + +Detailed unit test logs will be placed in the build directory under +`build/test-logs`. On some Linux distributions, running the test suite might require setting an explicit locale. If you see any locale-related errors, try setting the @@ -82,7 +111,35 @@ environment variable (which requires the `locales` package or equivalent): export LC_ALL="en_US.UTF-8" ``` -## Building and Developing Parquet Libraries +## Modular Build Targets + +Since there are several major parts of the C++ project, we have provided +modular CMake targets for building each library component, group of unit tests +and benchmarks, and their dependencies: + +* `make arrow` for Arrow core libraries +* `make parquet` for Parquet libraries +* `make gandiva` for Gandiva (LLVM expression compiler) libraries +* `make plasma` for Plasma libraries, server + +To build the unit tests or benchmarks, add `-tests` or `-benchmarks` to the +target name. So `make arrow-tests` will build the Arrow core unit tests. Using +the `-all` target, e.g. `parquet-all`, will build everything. + +If you wish to only build and install one or more project subcomponents, we +have provided the CMake option `ARROW_OPTIONAL_INSTALL` to only install targets +that have been built. For example, if you only wish to build the Parquet +libraries, its tests, and its dependencies, you can run: + +``` +cmake .. -DARROW_PARQUET=ON -DARROW_OPTIONAL_INSTALL=ON -DARROW_BUILD_TESTS=ON +make parquet +make install +``` + +If you omit an explicit target when invoking `make`, all targets will be built. + +## Parquet Development Notes To build the C++ libraries for Apache Parquet, add the flag `-DARROW_PARQUET=ON` when invoking CMake. The Parquet libraries and unit tests @@ -117,10 +174,10 @@ not use the macro. Follow the directions for simple build except run cmake with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: - cmake -DARROW_BUILD_BENCHMARKS=ON .. + cmake -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON .. and instead of make unittest run either `make; ctest` to run both unit tests -and benchmarks or `make runbenchmark` to run only the benchmark tests. +and benchmarks or `make benchmark` to run only the benchmark tests. 
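For concreteness, a benchmark-only session might look like the sketch below. The `benchmark` target added in this change is a thin wrapper around `ctest -L benchmark` (see the `add_custom_target(benchmark ctest -L benchmark)` hunk in CMakeLists.txt above), so either invocation is equivalent:

```shell
mkdir release && cd release
cmake -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON ..
make -j4           # build libraries, tests, and benchmarks
make benchmark     # runs: ctest -L benchmark
```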
Benchmark logs will be placed in the build directory under `build/benchmark-logs`. @@ -250,7 +307,7 @@ The optional `gandiva` libraries and tests can be built by passing `-DARROW_GANDIVA=on`. ```shell -cmake .. -DARROW_GANDIVA=on +cmake .. -DARROW_GANDIVA=ON -DARROW_BUILD_TESTS=ON make ctest -L gandiva ``` @@ -258,6 +315,55 @@ ctest -L gandiva This library is still in Alpha stages, and subject to API changes without deprecation warnings. +### Building and developing Flight (optional) + +In addition to the Arrow dependencies, Flight requires: +* gRPC (>= 1.14, roughly) +* Protobuf (>= 3.6, earlier versions may work) +* c-ares (used by gRPC) + +By default, Arrow will try to download and build these dependencies +when building Flight. + +The optional `flight` libraries and tests can be built by passing +`-DARROW_FLIGHT=ON`. + +```shell +cmake .. -DARROW_FLIGHT=ON -DARROW_BUILD_TESTS=ON +make +``` + +You can also use existing installations of the extra dependencies. +When building, set the environment variables `GRPC_HOME` and/or +`PROTOBUF_HOME` and/or `CARES_HOME`. + +You may try using system libraries for gRPC and Protobuf, but these +are likely to be too old. + +On Ubuntu/Debian, you can try: + +```shell +sudo apt-get install libgrpc-dev libgrpc++-dev protobuf-compiler-grpc libc-ares-dev +``` + +Note that the version of gRPC in Ubuntu 18.10 is too old; you will +have to install gRPC from source. (Ubuntu 19.04/Debian Sid may work.) + +On macOS, you can try [Homebrew][1]: + +```shell +brew install grpc +``` + +You can also install gRPC from source. In this case, you must install +gRPC to generate the necessary files for CMake to find gRPC: + +```shell +cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DgRPC_PROTOBUF_PROVIDER=package -DgRPC_ZLIB_PROVIDER=package -DgRPC_CARES_PROVIDER=package -DgRPC_SSL_PROVIDER=package +``` + +You can then specify `-DgRPC_DIR` to `cmake`. + ### API documentation To generate the (html) API documentation, run the following command in the apidoc @@ -269,9 +375,13 @@ This requires [Doxygen](http://www.doxygen.org) to be installed. ## Development -This project follows [Google's C++ Style Guide][3] with minor exceptions. We do -not encourage anonymous namespaces and we relax the line length restriction to -90 characters. +This project follows [Google's C++ Style Guide][3] with minor exceptions: + + * We relax the line length restriction to 90 characters. + * We use the NULLPTR macro defined in `src/arrow/util/macros.h` to + support building C++/CLI (ARROW-1134) + * We use doxygen style comments ("///") instead of line comments ("//") + in header files. ### Memory Pools @@ -281,6 +391,12 @@ which use the default pool without explicitly passing it. You can disable these constructors in your application (so that you are accounting properly for all memory allocations) by defining `ARROW_NO_DEFAULT_MEMORY_POOL`. +### Header files + +We use the `.h` extension for C++ header files. Any header file name not +containing `internal` is considered to be a public header, and will be +automatically installed by the build. + ### Error Handling and Exceptions For error handling, we use `arrow::Status` values instead of throwing C++ @@ -374,6 +490,12 @@ You may find the required packages at http://releases.llvm.org/download.html or use the Debian/Ubuntu APT repositories on https://apt.llvm.org/. On macOS with [Homebrew][1] you can get it via `brew install llvm@6`. +Depending on how you installed clang-format, the build system may not be able +to find it. 
You can provide an explicit path to your LLVM installation (or the +root path for the clang tools) with the environment variable +`$CLANG_TOOLS_PATH` or by passing `-DClangTools_PATH=$PATH_TO_CLANG_TOOLS` when +invoking CMake. + ## Checking for ABI and API stability To build ABI compliance reports, you need to install the two tools @@ -427,6 +549,14 @@ both of these options would be used rarely. Current known uses-cases when they a * Parameterized tests in google test. +## CMake version requirements + +We support CMake 3.2 and higher. Some features require a newer version of CMake: + +* Building the benchmarks requires 3.6 or higher +* Building zstd from source requires 3.7 or higher +* Building Gandiva JNI bindings requires 3.11 or higher + [1]: https://brew.sh/ [2]: https://github.com/apache/arrow/blob/master/cpp/apidoc/Windows.md [3]: https://google.github.io/styleguide/cppguide.html diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index e5285873c9e02..38ce17fb810cc 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -518,7 +518,7 @@ HIDE_UNDOC_CLASSES = NO # included in the documentation. # The default value is: NO. -HIDE_FRIEND_COMPOUNDS = NO +HIDE_FRIEND_COMPOUNDS = YES # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these @@ -741,7 +741,7 @@ CITE_BIB_FILES = # messages are off. # The default value is: NO. -QUIET = NO +QUIET = YES # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES @@ -779,7 +779,7 @@ WARN_NO_PARAMDOC = NO # a warning is encountered. # The default value is: NO. -WARN_AS_ERROR = NO +WARN_AS_ERROR = YES # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which @@ -858,7 +858,7 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = ../src/arrow/vendored # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -2075,7 +2075,8 @@ INCLUDE_FILE_PATTERNS = PREDEFINED = __attribute__(x)= \ __declspec(x)= \ ARROW_EXPORT= \ - ARROW_EXTERN_TEMPLATE= + ARROW_EXTERN_TEMPLATE= \ + ARROW_DEPRECATED(x)= # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md index 5199c2fdbfa59..8a724d0342be7 100644 --- a/cpp/apidoc/Windows.md +++ b/cpp/apidoc/Windows.md @@ -38,10 +38,11 @@ Launch cmd.exe and run following commands: conda config --add channels conda-forge ``` -Now, you can bootstrap a build environment +Now, you can bootstrap a build environment (call from the root directory of the +Arrow codebase): ```shell -conda create -n arrow-dev cmake git boost-cpp flatbuffers rapidjson cmake thrift-cpp snappy zlib brotli gflags lz4-c zstd -c conda-forge +conda create -n arrow-dev --file=ci\conda_env_cpp.yml ``` > **Note:** Make sure to get the `conda-forge` build of `gflags` as the diff --git a/cpp/apidoc/index.md b/cpp/apidoc/index.md index c887a74e64124..076c29726b994 100644 --- a/cpp/apidoc/index.md +++ b/cpp/apidoc/index.md @@ -38,6 +38,5 @@ Table of Contents * Instructions on how to build Arrow C++ on [Windows](Windows.md) * How to access [HDFS](HDFS.md) * Tutorials - * [Convert a vector of row-wise data into an Arrow table](tutorials/row_wise_conversion.md) * [Using the Plasma In-Memory Object Store](tutorials/plasma.md) * [Use Plasma to Access Tensors from C++ in Python](tutorials/tensor_to_py.md) diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md index 472d479c4b2f9..40c5a10603e71 100644 --- a/cpp/apidoc/tutorials/plasma.md +++ b/cpp/apidoc/tutorials/plasma.md @@ -80,7 +80,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); // Disconnect the Plasma client. ARROW_CHECK_OK(client.Disconnect()); } @@ -182,7 +182,7 @@ was written by the `Create` command. int64_t data_size = 100; // The address of the buffer allocated by the Plasma store will be written at // this address. -uint8_t* data; +std::shared_ptr data; // Create a Plasma object by specifying its ID and size. ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data)); ``` @@ -194,7 +194,7 @@ metadata (as raw bytes) and the fourth argument is the size of the metadata. // Create a Plasma object with metadata. int64_t data_size = 100; std::string metadata = "{'author': 'john'}"; -uint8_t* data; +std::shared_ptr data; client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data); ``` @@ -226,7 +226,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); // Create an object with a fixed ObjectID. ObjectID object_id = ObjectID::from_binary("00000000000000000000"); int64_t data_size = 1000; @@ -332,7 +332,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); ObjectID object_id = ObjectID::from_binary("00000000000000000000"); ObjectBuffer object_buffer; ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); @@ -421,7 +421,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. 
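  // (Note: Connect takes only the store socket path; the manager socket
  // argument that older revisions required, passed as "", has been removed.)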
PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); int fd; ARROW_CHECK_OK(client.Subscribe(&fd)); diff --git a/cpp/apidoc/tutorials/row_wise_conversion.md b/cpp/apidoc/tutorials/row_wise_conversion.md deleted file mode 100644 index 750a923c7846b..0000000000000 --- a/cpp/apidoc/tutorials/row_wise_conversion.md +++ /dev/null @@ -1,194 +0,0 @@ - - -Convert a vector of row-wise data into an Arrow table -===================================================== - -While we want to use columnar data structures to build efficient operations, we -often receive data in a row-wise fashion from other systems. In the following, -we want give a brief introduction into the classes provided by Apache Arrow by -showing how to transform row-wise data into a columnar table. - -The data in this example is stored in the following struct: - -``` -struct data_row { - int64_t id; - double cost; - std::vector cost_components; -}; - -std::vector rows; -``` - -The final representation should be an `arrow::Table` which in turn is made up of -an `arrow::Schema` and a list of `arrow::Column`. An `arrow::Column` is again a -named collection of one or more `arrow::Array` instances. As the first step, we -will iterate over the data and build up the arrays incrementally. For this task, -we provide `arrow::ArrayBuilder` classes that help in the construction of the -final `arrow::Array` instances. - -For each type, Arrow has a specially typed builder class. For the primitive -values `id` and `cost` we can use the respective `arrow::Int64Builder` and -`arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two -builders, a top-level `arrow::ListBuilder` that builds the array of offsets and -a nested `arrow::DoubleBuilder` that constructs the underlying values array that -is referenced by the offsets in the former array. - -``` -// The builders are more efficient using -// arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of -// the underlying memory regions in-place. At the moment, arrow::jemalloc is only -// supported on Unix systems, not Windows. - -using arrow::DoubleBuilder; -using arrow::Int64Builder; -using arrow::ListBuilder; - -arrow::MemoryPool* pool = arrow::default_memory_pool(); -Int64Builder id_builder(pool); -DoubleBuilder cost_builder(pool); -std::unique_ptr components_values_builder(new DoubleBuilder(pool)); -ListBuilder components_builder(pool, std::move(components_values_builder)); -``` - -Now we can loop over our existing data and insert it into the builders. The -`Append` calls here may fail (e.g. we cannot allocate enough additional memory). -Thus we need to check their return values. For more information on these values, -check the documentation about `arrow::Status`. - -``` -for (const data_row& row : rows) { - ARROW_RETURN_NOT_OK(id_builder.Append(row.id)); - ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost)); - - // Indicate the start of a new list row. This will memorise the current - // offset in the values builder. - ARROW_RETURN_NOT_OK(components_builder.Append()); - // Store the actual values. The final nullptr argument tells the underyling - // builder that all added values are valid, i.e. non-null. 
- ARROW_RETURN_NOT_OK(components_values_builder->Append( - row.cost_components.data(), row.cost_components.size(), - nullptr); -} -``` - -At the end, we finalise the arrays, declare the (type) schema and combine them - into a single `arrow::Table`: - -``` -std::shared_ptr id_array; -ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array)); -std::shared_ptr cost_array; -ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array)); -std::shared_ptr cost_components_array; -ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array)); - -std::vector> schema_vector = { - arrow::field("id", arrow::int64()), - arrow::field("cost", arrow::float64()), - arrow::field("cost_components", arrow::list(arrow::float64())) -}; -auto schema = std::make_shared(schema_vector); - -std::shared_ptr table = arrow::Table::Make(schema, - {id_array, cost_array, cost_components_array}); -``` - -The final `table` variable is the one we then can pass on to other functions -that can consume Apache Arrow memory structures. This object has ownership of -all referenced data, thus we don't have to care about undefined references once -we leave the scope of the function building the table and its underlying arrays. - - - -Converting an Arrow Table back into row-wise representation -=========================================================== - -To convert an Arrow table back into the same row-wise representation as in the -above section, we first will check that the table conforms to our expected -schema and then will build up the vector of rows incrementally. - -For the check if the table is as expected, we can utilise solely its schema. - -``` -// This is our input that was passed in from the outside. -std::shared_ptr table; - -std::vector> schema_vector = { - arrow::field("id", arrow::int64()), - arrow::field("cost", arrow::float64()), - arrow::field("cost_components", arrow::list(arrow::float64())) -}; -auto expected_schema = std::make_shared(schema_vector); - -if (!expected_schema->Equals(*table->schema())) { - // The table doesn't have the expected schema thus we cannot directly - // convert it to our target representation. - // TODO: Implement your custom error handling logic here. -} -``` - -As we have ensured that the table has the expected structure, we can unpack the -underlying arrays. For the primitive columns `id` and `cost` we can use the high -level functions to get the values whereas for the nested column -`cost_components` we need to access the C-pointer to the data to copy its -contents into the resulting `std::vector`. Here we need to be care to -also add the offset to the pointer. This offset is needed to enable zero-copy -slicing operations. While this could be adjusted automatically for double -arrays, this cannot be done for the accompanying bitmap as often the slicing -border would be inside a byte. - -``` -// For simplicity, we assume that all arrays consist of a single chunk here. -// In a productive implementation this should either be explicitly check or code -// added that can treat chunked arrays. - -auto ids = std::static_pointer_cast( - table->column(0)->data()->chunk(0)); -auto costs = std::static_pointer_castcolumn(1)->data()->chunk(0)); -auto cost_components = std::static_pointer_castcolumn(2)->data()->chunk(0)); -auto cost_components_values = std::static_pointer_cast( - cost_components->values()); -// To enable zero-copy slices, the native values pointer might need to account -// for this slicing offset. 
This is not needed for the higher level functions -// like Value(…) that already account for this offset internally. -const double* cost_components_values_ptr = cost_components_values->data() - + cost_components_values->offset(); -``` - -After we have unpacked the arrays from the table, we can iterate over them in a -row-wise fashion and fill our target, row-wise representation. - -``` -std::vector rows; - -for (int64_t i = 0; i < table->num_rows(); i++) { - // Another simplification in this example is that we assume that there are - // no null entries, e.g. each row is fill with valid values. - int64_t id = ids->Value(i); - double cost = costs->Value(i); - const double* first = cost_components_values_ptr + cost_components->value_offset(i); - const double* last = cost_components_values_ptr + cost_components->value_offset(i + 1); - std::vector components_vec(first, last); - rows.push_back({id, cost, components_vec}); -} -``` diff --git a/cpp/apidoc/tutorials/tensor_to_py.md b/cpp/apidoc/tutorials/tensor_to_py.md index 0be973a4f3df9..cd191fea07d09 100644 --- a/cpp/apidoc/tutorials/tensor_to_py.md +++ b/cpp/apidoc/tutorials/tensor_to_py.md @@ -105,7 +105,7 @@ The `inputs` variable will be a list of Object IDs in their raw byte string form import pyarrow as pa import pyarrow.plasma as plasma -plasma_client = plasma.connect('/tmp/plasma', '', 0) +plasma_client = plasma.connect('/tmp/plasma') # inputs: a list of object ids inputs = [20 * b'1'] diff --git a/cpp/build-support/build-lz4-lib.sh b/cpp/build-support/build-lz4-lib.sh index d33686655a8ac..fa4c61b48d4a7 100755 --- a/cpp/build-support/build-lz4-lib.sh +++ b/cpp/build-support/build-lz4-lib.sh @@ -19,7 +19,7 @@ # export CFLAGS="${CFLAGS} -O3 -fPIC" if [ -z "$MAKELEVEL" ]; then - make -j4 + make -j4 "$@" else - make + make "$@" fi diff --git a/cpp/build-support/iwyu/mappings/arrow-misc.imp b/cpp/build-support/iwyu/mappings/arrow-misc.imp index 8bb65e62d98e3..7ff99108c5aff 100644 --- a/cpp/build-support/iwyu/mappings/arrow-misc.imp +++ b/cpp/build-support/iwyu/mappings/arrow-misc.imp @@ -49,7 +49,7 @@ { symbol: ["shared_ptr", private, "", public ] }, { symbol: ["_Node_const_iterator", private, "", public ] }, { symbol: ["unordered_map<>::mapped_type", private, "", public ] }, - { symbol: ["move", private, "", public ] }, + { symbol: ["std::move", private, "", public ] }, { symbol: ["pair", private, "", public ] }, { symbol: ["errno", private, "", public ] }, { symbol: ["posix_memalign", private, "", public ] } diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py index 4c26927740dbb..ab2de5901a4df 100644 --- a/cpp/build-support/lint_cpp_cli.py +++ b/cpp/build-support/lint_cpp_cli.py @@ -19,8 +19,6 @@ import argparse import re import os -import sys -import traceback parser = argparse.ArgumentParser( description="Check for illegal headers for C++/CLI applications") @@ -34,6 +32,10 @@ _RETURN_NOT_OK_REGEX = re.compile(r'.*\sRETURN_NOT_OK.*') +def _paths(paths): + return [p.strip().replace('/', os.path.sep) for p in paths.splitlines()] + + def _strip_comments(line): m = _STRIP_COMMENT_REGEX.match(line) if not m: @@ -48,11 +50,11 @@ def lint_file(path): (lambda x: '' in x, 'Uses ', []), (lambda x: re.match(_NULLPTR_REGEX, x), 'Uses nullptr', []), (lambda x: re.match(_RETURN_NOT_OK_REGEX, x), - 'Use ARROW_RETURN_NOT_OK in header files', - ['arrow/status.h', - 'test', - 'arrow/util/hash.h', - 'arrow/python/util']) + 'Use ARROW_RETURN_NOT_OK in header files', _paths('''\ + arrow/status.h + test + arrow/util/hash.h + 
arrow/python/util''')) ] with open(path) as f: @@ -63,28 +65,23 @@ def lint_file(path): continue if rule(stripped_line): - raise Exception('File {0} failed C++/CLI lint check: {1}\n' - 'Line {2}: {3}' - .format(path, why, i + 1, line)) - - -EXCLUSIONS = [ - 'arrow/python/iterators.h', - 'arrow/util/date.h', - 'arrow/util/hashing.h', - 'arrow/util/macros.h', - 'arrow/util/parallel.h', - 'arrow/util/string_view/string_view.hpp', - 'arrow/util/xxhash/xxhash.c', - 'arrow/util/xxhash/xxhash.h', - 'arrow/visitor_inline.h', - 'gandiva/cache.h', - 'gandiva/jni', - 'test', - 'internal' -] - -try: + yield path, why, i, line + + +EXCLUSIONS = _paths('''\ + arrow/python/iterators.h + arrow/util/hashing.h + arrow/util/macros.h + arrow/util/parallel.h + arrow/vendored + arrow/visitor_inline.h + gandiva/cache.h + gandiva/jni + test + internal''') + + +def lint_files(): for dirpath, _, filenames in os.walk(arguments.source_path): for filename in filenames: full_path = os.path.join(dirpath, filename) @@ -100,7 +97,13 @@ def lint_file(path): # Only run on header files if filename.endswith('.h'): - lint_file(full_path) -except Exception: - traceback.print_exc() - sys.exit(1) + yield from lint_file(full_path) + + +if __name__ == '__main__': + failures = list(lint_files()) + for path, why, i, line in failures: + print('File {0} failed C++/CLI lint check: {1}\n' + 'Line {2}: {3}'.format(path, why, i + 1, line)) + if failures: + exit(1) diff --git a/cpp/build-support/clang_format_exclusions.txt b/cpp/build-support/lint_exclusions.txt similarity index 61% rename from cpp/build-support/clang_format_exclusions.txt rename to cpp/build-support/lint_exclusions.txt index c04523af1db81..2964898f4f24d 100644 --- a/cpp/build-support/clang_format_exclusions.txt +++ b/cpp/build-support/lint_exclusions.txt @@ -4,11 +4,6 @@ *pyarrow_lib.h *python/config.h *python/platform.h -*util/date.h -*util/string_view/* -*util/variant.h -*util/variant/* *thirdparty/ae/* -*xxhash.cc -*xxhash.h +*vendored/* *RcppExports.cpp* diff --git a/cpp/build-support/lintutils.py b/cpp/build-support/lintutils.py new file mode 100644 index 0000000000000..0a54daa1ee9bf --- /dev/null +++ b/cpp/build-support/lintutils.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
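+
+# Shared helpers for the lint driver scripts in this directory
+# (run_clang_format.py, run_clang_tidy.py, run_cpplint.py): chunk() and
+# dechunk() batch and flatten file lists, run_parallel() fans commands out
+# to subprocesses, get_sources() walks a source tree applying exclusion
+# globs, and stdout_pathcolonline() scrapes "<path>:<line>" complaints out
+# of a tool's output.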
+ +import os +from fnmatch import fnmatch +from subprocess import Popen + + +def chunk(seq, n): + """ + divide a sequence into equal sized chunks + (the last chunk may be smaller, but won't be empty) + """ + chunks = [] + some = [] + for element in seq: + if len(some) == n: + chunks.append(some) + some = [] + some.append(element) + if len(some) > 0: + chunks.append(some) + return chunks + + +def dechunk(chunks): + "flatten chunks into a single list" + seq = [] + for chunk in chunks: + seq.extend(chunk) + return seq + + +def run_parallel(cmds, **kwargs): + """ + Run each of cmds (with shared **kwargs) using subprocess.Popen + then wait for all of them to complete. + Runs batches of os.cpu_count() * 2 from cmds + returns a list of tuples containing each process' + returncode, stdout, stderr + """ + complete = [] + for cmds_batch in chunk(cmds, os.cpu_count() * 2): + procs_batch = [Popen(cmd, **kwargs) for cmd in cmds_batch] + for proc in procs_batch: + stdout, stderr = proc.communicate() + complete.append((proc.returncode, stdout, stderr)) + return complete + + +_source_extensions = ''' +.h +.cc +'''.split() + + +def get_sources(source_dir, exclude_globs=[]): + sources = [] + for directory, subdirs, basenames in os.walk(source_dir): + for path in [os.path.join(directory, basename) for basename in basenames]: + # filter out non-source files + if os.path.splitext(path)[1] not in _source_extensions: + continue + + path = os.path.abspath(path) + + # filter out files that match the globs in the globs file + if any([fnmatch(path, glob) for glob in exclude_globs]): + continue + + sources.append(path) + return sources + + +def stdout_pathcolonline(completed_process, filenames): + """ + given a completed process which may have reported some files as problematic + by printing the path name followed by ':' then a line number, examine + stdout and return the set of actually reported file names + """ + returncode, stdout, stderr = completed_process + bfilenames = set() + for filename in filenames: + bfilenames.add(filename.encode('utf-8') + b':') + problem_files = set() + for line in stdout.splitlines(): + for filename in bfilenames: + if line.startswith(filename): + problem_files.add(filename.decode('utf-8')) + bfilenames.remove(filename) + break + return problem_files, stdout diff --git a/cpp/build-support/run-clang-tidy.sh b/cpp/build-support/run-clang-tidy.sh deleted file mode 100755 index 75e9458e257ca..0000000000000 --- a/cpp/build-support/run-clang-tidy.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -# -# Runs clang format in the given directory -# Arguments: -# $1 - Path to the clang tidy binary -# $2 - Path to the compile_commands.json to use -# $3 - Apply fixes (will raise an error if false and not there where changes) -# $ARGN - Files to run clang-tidy on -# -CLANG_TIDY=$1 -shift -COMPILE_COMMANDS=$1 -shift -APPLY_FIXES=$1 -shift - -# clang format will only find its configuration if we are in -# the source tree or in a path relative to the source tree -if [ "$APPLY_FIXES" == "1" ]; then - $CLANG_TIDY -p $COMPILE_COMMANDS -fix $@ -else - NUM_CORRECTIONS=`$CLANG_TIDY -p $COMPILE_COMMANDS $@ 2>&1 | grep -v Skipping | grep "warnings* generated" | wc -l` - if [ "$NUM_CORRECTIONS" -gt "0" ]; then - echo "clang-tidy had suggested fixes. Please fix these!!!" - exit 1 - fi -fi diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 656ab7bd3b805..6b1c09efb4d8d 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -80,6 +80,10 @@ function setup_sanitizers() { TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" export TSAN_OPTIONS + UBSAN_OPTIONS="$UBSAN_OPTIONS print_stacktrace=1" + UBSAN_OPTIONS="$UBSAN_OPTIONS suppressions=$ROOT/build-support/ubsan-suppressions.txt" + export UBSAN_OPTIONS + # Enable leak detection even under LLVM 3.4, where it was disabled by default. # This flag only takes effect when running an ASAN build. # ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py index 24dcabb8c7169..1d1592d233ea7 100755 --- a/cpp/build-support/run_clang_format.py +++ b/cpp/build-support/run_clang_format.py @@ -16,74 +16,53 @@ # specific language governing permissions and limitations # under the License. +from __future__ import print_function +import lintutils +from subprocess import PIPE import argparse import difflib -import fnmatch import multiprocessing as mp -import os -import subprocess import sys +from functools import partial -class FileChecker(object): +# examine the output of clang-format and if changes are +# present assemble a (unified)patch of the difference +def _check_one_file(completed_processes, filename): + with open(filename, "rb") as reader: + original = reader.read() - def __init__(self, arguments): - self.quiet = arguments.quiet - self.clang_format_binary = arguments.clang_format_binary - - def run(self, filename): - if not self.quiet: - print("Checking {}".format(filename)) - # - # Due to some incompatibilities between Python 2 and - # Python 3, there are some specific actions we take here - # to make sure the difflib.unified_diff call works. - # - # In Python 2, the call to subprocess.check_output return - # a 'str' type. In Python 3, however, the call returns a - # 'bytes' type unless the 'encoding' argument is - # specified. Unfortunately, the 'encoding' argument is not - # in the Python 2 API. We could do an if/else here based - # on the version of Python we are running, but it's more - # straightforward to read the file in binary and do utf-8 - # conversion. In Python 2, it's just converting string - # types to unicode types, whereas in Python 3 it's - # converting bytes types to utf-8 encoded str types. This - # approach ensures that the arguments to - # difflib.unified_diff are acceptable string types in both - # Python 2 and Python 3. 
- with open(filename, "rb") as reader: - original = reader.read().decode('utf8') + returncode, stdout, stderr = completed_processes[filename] + formatted = stdout + if formatted != original: + # Run the equivalent of diff -u + diff = list(difflib.unified_diff( + original.decode('utf8').splitlines(True), + formatted.decode('utf8').splitlines(True), + fromfile=filename, + tofile="{} (after clang format)".format( + filename))) + else: + diff = None - # Run clang-format and capture its output - formatted = subprocess.check_output( - [self.clang_format_binary, - filename]) - formatted = formatted.decode('utf8') - if formatted != original: - # Run the equivalent of diff -u - diff = list(difflib.unified_diff( - original.splitlines(True), - formatted.splitlines(True), - fromfile=filename, - tofile="{} (after clang format)".format( - filename))) - if diff: - return filename, diff + return filename, diff if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Runs clang format on all of the source " - "files. If --fix is specified, and compares the output " - "with the existing file, outputting a unifiied diff if " - "there are any necessary changes") - parser.add_argument("clang_format_binary", + description="Runs clang-format on all of the source " + "files. If --fix is specified enforce format by " + "modifying in place, otherwise compare the output " + "with the existing file and output any necessary " + "changes as a patch in unified diff format") + parser.add_argument("--clang_format_binary", + required=True, help="Path to the clang-format binary") - parser.add_argument("exclude_globs", + parser.add_argument("--exclude_globs", help="Filename containing globs for files " "that should be excluded from the checks") - parser.add_argument("source_dir", + parser.add_argument("--source_dir", + required=True, help="Root directory of the source code") parser.add_argument("--fix", default=False, action="store_true", @@ -93,47 +72,67 @@ def run(self, filename): parser.add_argument("--quiet", default=False, action="store_true", help="If specified, only print errors") - arguments = parser.parse_args() + exclude_globs = [] + if arguments.exclude_globs: + for line in open(arguments.exclude_globs): + exclude_globs.append(line.strip()) + formatted_filenames = [] - exclude_globs = [line.strip() for line in open(arguments.exclude_globs)] - for directory, subdirs, filenames in os.walk(arguments.source_dir): - fullpaths = (os.path.join(directory, filename) - for filename in filenames) - source_files = [x for x in fullpaths - if x.endswith(".h") or - x.endswith(".cc") or - x.endswith(".cpp")] - formatted_filenames.extend( - # Filter out files that match the globs in the globs file - [filename for filename in source_files - if not any((fnmatch.fnmatch(filename, exclude_glob) - for exclude_glob in exclude_globs))]) + for path in lintutils.get_sources(arguments.source_dir, exclude_globs): + formatted_filenames.append(str(path)) - error = False if arguments.fix: if not arguments.quiet: - # Print out each file on its own line, but run - # clang format once for all of the files print("\n".join(map(lambda x: "Formatting {}".format(x), formatted_filenames))) - subprocess.check_call([arguments.clang_format_binary, - "-i"] + formatted_filenames) + + # Break clang-format invocations into chunks: each invocation formats + # 16 files. 
Wait for all processes to complete + results = lintutils.run_parallel([ + [arguments.clang_format_binary, "-i"] + some + for some in lintutils.chunk(formatted_filenames, 16) + ]) + for returncode, stdout, stderr in results: + # if any clang-format reported a parse error, bubble it + if returncode != 0: + sys.exit(returncode) + else: - checker = FileChecker(arguments) + # run an instance of clang-format for each source file in parallel, + # then wait for all processes to complete + results = lintutils.run_parallel([ + [arguments.clang_format_binary, filename] + for filename in formatted_filenames + ], stdout=PIPE, stderr=PIPE) + for returncode, stdout, stderr in results: + # if any clang-format reported a parse error, bubble it + if returncode != 0: + sys.exit(returncode) + + error = False + checker = partial(_check_one_file, { + filename: result + for filename, result in zip(formatted_filenames, results) + }) pool = mp.Pool() try: - for res in pool.imap(checker.run, formatted_filenames): - if res is not None: - filename, diff = res + # check the output from each invocation of clang-format in parallel + for filename, diff in pool.imap(checker, formatted_filenames): + if not arguments.quiet: + print("Checking {}".format(filename)) + if diff: print("{} had clang-format style issues".format(filename)) # Print out the diff to stderr error = True + # pad with a newline + print(file=sys.stderr) sys.stderr.writelines(diff) + except Exception: + error = True + raise finally: pool.terminate() pool.join() - - - sys.exit(1 if error else 0) + sys.exit(1 if error else 0) diff --git a/cpp/build-support/run_clang_tidy.py b/cpp/build-support/run_clang_tidy.py new file mode 100755 index 0000000000000..57a3e91bd1c15 --- /dev/null +++ b/cpp/build-support/run_clang_tidy.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
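+
+# Runs clang-tidy over the C++ sources in parallel, 16 files per
+# invocation. A typical command line looks like the following (the paths
+# are illustrative only):
+#
+#   run_clang_tidy.py --clang_tidy_binary clang-tidy-6.0 \
+#       --compile_commands build/compile_commands.json \
+#       --source_dir src --quiet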
+ +from __future__ import print_function +import argparse +import multiprocessing as mp +import lintutils +from subprocess import PIPE +import sys +from functools import partial + + +def _get_chunk_key(filenames): + # lists are not hashable so key on the first filename in a chunk + return filenames[0] + + +# clang-tidy outputs complaints in '/path:line_number: complaint' format, +# so we can scan its output to get a list of files to fix +def _check_some_files(completed_processes, filenames): + result = completed_processes[_get_chunk_key(filenames)] + return lintutils.stdout_pathcolonline(result, filenames) + + +def _check_all(cmd, filenames): + # each clang-tidy instance will process 16 files + chunks = lintutils.chunk(filenames, 16) + cmds = [cmd + some for some in chunks] + results = lintutils.run_parallel(cmds, stderr=PIPE, stdout=PIPE) + error = False + # record completed processes (keyed by the first filename in the input + # chunk) for lookup in _check_some_files + completed_processes = { + _get_chunk_key(some): result + for some, result in zip(chunks, results) + } + checker = partial(_check_some_files, completed_processes) + pool = mp.Pool() + try: + # check output of completed clang-tidy invocations in parallel + for problem_files, stdout in pool.imap(checker, chunks): + if problem_files: + msg = "clang-tidy suggested fixes for {}" + print("\n".join(map(msg.format, problem_files))) + error = True + except Exception: + error = True + raise + finally: + pool.terminate() + pool.join() + + if error: + sys.exit(1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Runs clang-tidy on all ") + parser.add_argument("--clang_tidy_binary", + required=True, + help="Path to the clang-tidy binary") + parser.add_argument("--exclude_globs", + help="Filename containing globs for files " + "that should be excluded from the checks") + parser.add_argument("--compile_commands", + required=True, + help="compile_commands.json to pass clang-tidy") + parser.add_argument("--source_dir", + required=True, + help="Root directory of the source code") + parser.add_argument("--fix", default=False, + action="store_true", + help="If specified, will attempt to fix the " + "source code instead of recommending fixes, " + "defaults to %(default)s") + parser.add_argument("--quiet", default=False, + action="store_true", + help="If specified, only print errors") + arguments = parser.parse_args() + + linted_filenames = [] + for path in lintutils.get_sources(arguments.source_dir): + linted_filenames.append(path) + + if not arguments.quiet: + msg = 'Tidying {}' if arguments.fix else 'Checking {}' + print("\n".join(map(msg.format, linted_filenames))) + + cmd = [ + arguments.clang_tidy_binary, + '-p', + arguments.compile_commands + ] + if arguments.fix: + cmd.append('-fix') + results = lintutils.run_parallel( + [cmd + some for some in lintutils.chunk(linted_filenames, 16)]) + for result in results: + result.check_returncode() + + else: + _check_all(cmd, linted_filenames) diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py new file mode 100755 index 0000000000000..035a02edd1978 --- /dev/null +++ b/cpp/build-support/run_cpplint.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import print_function +import lintutils +from subprocess import PIPE, STDOUT +import argparse +import multiprocessing as mp +import sys +import platform +from functools import partial + + +_filters = ''' +-whitespace/comments +-readability/todo +-build/header_guard +-build/c++11 +-runtime/references +-build/include_order +'''.split() + + +def _get_chunk_key(filenames): + # lists are not hashable so key on the first filename in a chunk + return filenames[0] + + +def _check_some_files(completed_processes, filenames): + # cpplint outputs complaints in '/path:line_number: complaint' format, + # so we can scan its output to get a list of files to fix + result = completed_processes[_get_chunk_key(filenames)] + return lintutils.stdout_pathcolonline(result, filenames) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Runs cpplint on all of the source files.") + parser.add_argument("--cpplint_binary", + required=True, + help="Path to the cpplint binary") + parser.add_argument("--exclude_globs", + help="Filename containing globs for files " + "that should be excluded from the checks") + parser.add_argument("--source_dir", + required=True, + help="Root directory of the source code") + parser.add_argument("--quiet", default=False, + action="store_true", + help="If specified, only print errors") + arguments = parser.parse_args() + + exclude_globs = [] + if arguments.exclude_globs: + for line in open(arguments.exclude_globs): + exclude_globs.append(line.strip()) + + linted_filenames = [] + for path in lintutils.get_sources(arguments.source_dir, exclude_globs): + linted_filenames.append(str(path)) + + cmd = [ + arguments.cpplint_binary, + '--verbose=2', + '--linelength=90', + '--filter=' + ','.join(_filters) + ] + if (arguments.cpplint_binary.endswith('.py') and + platform.system() == 'Windows'): + # Windows doesn't support executable scripts; execute with + # sys.executable + cmd.insert(0, sys.executable) + if arguments.quiet: + cmd.append('--quiet') + else: + print("\n".join(map(lambda x: "Linting {}".format(x), + linted_filenames))) + + # lint files in chunks: each invocation of cpplint will process 16 files + chunks = lintutils.chunk(linted_filenames, 16) + cmds = [cmd + some for some in chunks] + results = lintutils.run_parallel(cmds, stdout=PIPE, stderr=STDOUT) + + error = False + # record completed processes (keyed by the first filename in the input + # chunk) for lookup in _check_some_files + completed_processes = { + _get_chunk_key(filenames): result + for filenames, result in zip(chunks, results) + } + checker = partial(_check_some_files, completed_processes) + pool = mp.Pool() + try: + # scan the outputs of various cpplint invocations in parallel to + # distill a list of problematic files + for problem_files, stdout in pool.imap(checker, chunks): + if problem_files: + msg = "{} had cpplint issues" + print("\n".join(map(msg.format, problem_files))) + print(stdout, 
file=sys.stderr) + error = True + except Exception: + error = True + raise + finally: + pool.terminate() + pool.join() + + sys.exit(1 if error else 0) diff --git a/cpp/build-support/tsan-suppressions.txt b/cpp/build-support/tsan-suppressions.txt new file mode 100644 index 0000000000000..ce897c8591188 --- /dev/null +++ b/cpp/build-support/tsan-suppressions.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Thread leak in CUDA +thread:libcuda.so diff --git a/c_glib/tool/Makefile.am b/cpp/build-support/ubsan-suppressions.txt similarity index 95% rename from c_glib/tool/Makefile.am rename to cpp/build-support/ubsan-suppressions.txt index 5d7498b957520..13a83393a9124 100644 --- a/c_glib/tool/Makefile.am +++ b/cpp/build-support/ubsan-suppressions.txt @@ -14,6 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -EXTRA_DIST = \ - get-version.py diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 916b9ebddb88e..1591d864a1a7c 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -97,7 +97,9 @@ function(ADD_ARROW_LIB LIB_NAME) SHARED_PRIVATE_LINK_LIBS EXTRA_INCLUDES PRIVATE_INCLUDES - DEPENDENCIES) + DEPENDENCIES + SHARED_INSTALL_INTERFACE_LIBS + STATIC_INSTALL_INTERFACE_LIBS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") @@ -119,9 +121,11 @@ function(ADD_ARROW_LIB LIB_NAME) set(BUILD_STATIC ${ARROW_BUILD_STATIC}) endif() - if(MSVC) + if(MSVC OR (CMAKE_GENERATOR STREQUAL Xcode)) # MSVC needs to compile C++ separately for each library kind (shared and static) # because of dllexport declarations + # The Xcode generator doesn't reliably work with Xcode as target names are not + # guessed correctly. set(LIB_DEPS ${ARG_SOURCES}) set(EXTRA_DEPS ${ARG_DEPENDENCIES}) @@ -180,11 +184,14 @@ function(ADD_ARROW_LIB LIB_NAME) ${ARG_PRIVATE_INCLUDES}) endif() - if(APPLE) + if(APPLE AND NOT DEFINED $ENV{EMSCRIPTEN}) # On OS X, you can avoid linking at library load time and instead # expecting that the symbols have been loaded separately. This happens # with libpython* where there can be conflicts between system Python and # the Python from a thirdparty distribution + # + # When running with the Emscripten Compiler, we need not worry about + # python, and the Emscripten Compiler does not support this option. 
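+ # ("-undefined dynamic_lookup" tells the linker to defer resolution of
+ # undefined symbols until the library is loaded instead of failing the
+ # link, which is what the libpython scenario above relies on.)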
set(ARG_SHARED_LINK_FLAGS "-undefined dynamic_lookup ${ARG_SHARED_LINK_FLAGS}") endif() @@ -199,8 +206,16 @@ function(ADD_ARROW_LIB LIB_NAME) VERSION "${ARROW_FULL_SO_VERSION}" SOVERSION "${ARROW_SO_VERSION}") + if (ARG_SHARED_INSTALL_INTERFACE_LIBS) + set(INTERFACE_LIBS ${ARG_SHARED_INSTALL_INTERFACE_LIBS}) + else() + set(INTERFACE_LIBS ${ARG_SHARED_LINK_LIBS}) + endif() + target_link_libraries(${LIB_NAME}_shared - LINK_PUBLIC ${ARG_SHARED_LINK_LIBS} + LINK_PUBLIC + "$" + "$" LINK_PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS}) if (ARROW_RPATH_ORIGIN) @@ -226,10 +241,12 @@ function(ADD_ARROW_LIB LIB_NAME) endif() install(TARGETS ${LIB_NAME}_shared + ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if (BUILD_STATIC) @@ -268,14 +285,24 @@ function(ADD_ARROW_LIB LIB_NAME) LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" OUTPUT_NAME ${LIB_NAME_STATIC}) + if (ARG_STATIC_INSTALL_INTERFACE_LIBS) + set(INTERFACE_LIBS ${ARG_STATIC_INSTALL_INTERFACE_LIBS}) + else() + set(INTERFACE_LIBS ${ARG_STATIC_LINK_LIBS}) + endif() + target_link_libraries(${LIB_NAME}_static - LINK_PUBLIC ${ARG_STATIC_LINK_LIBS}) + LINK_PUBLIC + "$" + "$") install(TARGETS ${LIB_NAME}_static + ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() # Modify variable in calling scope @@ -290,7 +317,7 @@ endfunction() ############################################################ # Add a new micro benchmark, with or without an executable that should be built. # If benchmarks are enabled then they will be run along side unit tests with ctest. -# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# 'make benchmark' and 'make unittest' to build/run only benchmark or unittests, # respectively. # # REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component @@ -306,10 +333,13 @@ endfunction() # \arg PREFIX a string to append to the name of the benchmark executable. For # example, if you have src/arrow/foo/bar-benchmark.cc, then PREFIX "foo" will # create test executable foo-bar-benchmark -function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) +# \arg LABELS the benchmark label or labels to assign the unit tests to. By +# default, benchmarks will go in the "benchmark" group. Custom targets for the +# group names must exist +function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(options) set(one_value_args) - set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX) + set(multi_value_args EXTRA_LINK_LIBS STATIC_LINK_LIBS DEPENDENCIES PREFIX LABELS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") @@ -328,12 +358,18 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) # This benchmark has a corresponding .cc file, set it up as an executable. 
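 # (As the following lines show, the executable links against
 # ARROW_BENCHMARK_LINK_LIBS unless the caller supplies STATIC_LINK_LIBS,
 # and is attached to the "benchmark" group target.)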
set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") - target_link_libraries(${BENCHMARK_NAME} ${ARROW_BENCHMARK_LINK_LIBS}) - add_dependencies(runbenchmark ${BENCHMARK_NAME}) + + if (ARG_STATIC_LINK_LIBS) + # Customize link libraries + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) + else() + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARROW_BENCHMARK_LINK_LIBS}) + endif() + add_dependencies(benchmark ${BENCHMARK_NAME}) set(NO_COLOR "--color_print=false") if (ARG_EXTRA_LINK_LIBS) - target_link_libraries(${BENCHMARK_NAME} ${ARG_EXTRA_LINK_LIBS}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_EXTRA_LINK_LIBS}) endif() else() # No executable, just invoke the benchmark (probably a script) directly. @@ -341,13 +377,27 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) set(NO_COLOR "") endif() + # Add test as dependency of relevant label targets + add_dependencies(all-benchmarks ${BENCHMARK_NAME}) + foreach (TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${BENCHMARK_NAME}) + endforeach() + if (ARG_DEPENDENCIES) add_dependencies(${BENCHMARK_NAME} ${ARG_DEPENDENCIES}) endif() + if (ARG_LABELS) + set(ARG_LABELS "benchmark;${ARG_LABELS}") + else() + set(ARG_LABELS benchmark) + endif() + add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) - set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") + set_property(TEST ${BENCHMARK_NAME} + APPEND PROPERTY + LABELS ${ARG_LABELS}) endfunction() ############################################################ @@ -375,9 +425,9 @@ endfunction() # \arg LABELS the unit test label or labels to assign the unit tests # to. By default, unit tests will go in the "unittest" group, but if we have # multiple unit tests in some subgroup, you can assign a test to multiple -# groups using the syntax unittest;GROUP2;GROUP3. Custom targets for the group +# groups use the syntax unittest;GROUP2;GROUP3. 
Custom targets for the group
+# names must exist
-function(ADD_ARROW_TEST REL_TEST_NAME)
+function(ADD_TEST_CASE REL_TEST_NAME)
   set(options NO_VALGRIND ENABLED)
   set(one_value_args)
   set(multi_value_args SOURCES STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES
@@ -387,18 +437,6 @@
     message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
   endif()
 
-  if (NOT "${ARROW_TEST_INCLUDE_LABELS}" STREQUAL "")
-    set(_SKIP_TEST TRUE)
-    foreach (_INCLUDED_LABEL ${ARG_LABELS})
-      if ("${ARG_LABELS}" MATCHES "${_INCLUDED_LABEL}")
-        set(_SKIP_TEST FALSE)
-      endif()
-    endforeach()
-    if (_SKIP_TEST)
-      return()
-    endif()
-  endif()
-
   if (NO_TESTS AND NOT ARG_ENABLED)
     return()
   endif()
@@ -408,12 +446,6 @@
     set(TEST_NAME "${ARG_PREFIX}-${TEST_NAME}")
   endif()
 
-  if (ARG_LABELS)
-    set(ARG_LABELS "${ARG_LABELS}")
-  else()
-    set(ARG_LABELS unittest)
-  endif()
-
   if (ARG_SOURCES)
     set(SOURCES ${ARG_SOURCES})
   else()
@@ -444,10 +476,6 @@
     add_dependencies(${TEST_NAME} ${ARG_EXTRA_DEPENDENCIES})
   endif()
 
-  foreach (TEST_LABEL ${ARG_LABELS})
-    add_dependencies(${TEST_LABEL} ${TEST_NAME})
-  endforeach()
-
   if (ARROW_TEST_MEMCHECK AND NOT ARG_NO_VALGRIND)
     SET_PROPERTY(TARGET ${TEST_NAME}
       APPEND_STRING PROPERTY
@@ -456,18 +484,89 @@
       bash -c "cd '${CMAKE_SOURCE_DIR}'; \
                valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \
                --leak-check=full --leak-check-heuristics=stdstring --error-exitcode=1 ${TEST_PATH}")
-  elseif(MSVC)
+  elseif(WIN32)
     add_test(${TEST_NAME} ${TEST_PATH})
   else()
     add_test(${TEST_NAME}
       ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH})
   endif()
 
+  # Add test as dependency of relevant targets
+  add_dependencies(all-tests ${TEST_NAME})
+  foreach (TARGET ${ARG_LABELS})
+    add_dependencies(${TARGET} ${TEST_NAME})
+  endforeach()
+
+  if (ARG_LABELS)
+    set(ARG_LABELS "unittest;${ARG_LABELS}")
+  else()
+    set(ARG_LABELS unittest)
+  endif()
+
   set_property(TEST ${TEST_NAME}
     APPEND PROPERTY
     LABELS ${ARG_LABELS})
 endfunction()
 
+############################################################
+# Examples
+############################################################
+# Add a new example, with or without an executable that should be built.
+# If examples are enabled then they will be run alongside unit tests with ctest.
+# 'make runexample' to build/run only examples.
+#
+# REL_EXAMPLE_NAME is the name of the example app. It may be a single component
+# (e.g. monotime-example) or contain additional components (e.g.
+# net/net_util-example). Either way, the last component must be a globally
+# unique name.

+# The example will be registered as a unit test with ctest with a label
+# of 'example'.
+#
+# Arguments after the test name will be passed to set_tests_properties().
+#
+# \arg PREFIX a string to append to the name of the example executable. 
For +# example, if you have src/arrow/foo/bar-example.cc, then PREFIX "foo" will +# create test executable foo-bar-example +function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) + set(options) + set(one_value_args) + set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(NO_EXAMPLES) + return() + endif() + get_filename_component(EXAMPLE_NAME ${REL_EXAMPLE_NAME} NAME_WE) + + if(ARG_PREFIX) + set(EXAMPLE_NAME "${ARG_PREFIX}-${EXAMPLE_NAME}") + endif() + + if(EXISTS ${CMAKE_SOURCE_DIR}/examples/arrow/${REL_EXAMPLE_NAME}.cc) + # This example has a corresponding .cc file, set it up as an executable. + set(EXAMPLE_PATH "${EXECUTABLE_OUTPUT_PATH}/${EXAMPLE_NAME}") + add_executable(${EXAMPLE_NAME} "${REL_EXAMPLE_NAME}.cc") + target_link_libraries(${EXAMPLE_NAME} ${ARROW_EXAMPLE_LINK_LIBS}) + add_dependencies(runexample ${EXAMPLE_NAME}) + set(NO_COLOR "--color_print=false") + + if (ARG_EXTRA_LINK_LIBS) + target_link_libraries(${EXAMPLE_NAME} ${ARG_EXTRA_LINK_LIBS}) + endif() + endif() + + if (ARG_DEPENDENCIES) + add_dependencies(${EXAMPLE_NAME} ${ARG_DEPENDENCIES}) + endif() + + add_test(${EXAMPLE_NAME} ${EXAMPLE_PATH}) + set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") +endfunction() + ############################################################ # Fuzzing ############################################################ @@ -497,3 +596,36 @@ function(ADD_ARROW_FUZZING REL_FUZZING_NAME) PROPERTIES LINK_FLAGS "-fsanitize=fuzzer") endfunction() + +################################################### + +function(ARROW_INSTALL_ALL_HEADERS PATH) + set(options) + set(one_value_args) + set(multi_value_args PATTERN) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (NOT ARG_PATTERN) + # The .hpp extension is used by some vendored libraries + set(ARG_PATTERN "*.h" "*.hpp") + endif() + file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) + + set(PUBLIC_HEADERS) + foreach(HEADER ${CURRENT_DIRECTORY_HEADERS}) + if (NOT ((HEADER MATCHES "internal"))) + LIST(APPEND PUBLIC_HEADERS ${HEADER}) + endif() + endforeach() + install(FILES + ${PUBLIC_HEADERS} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PATH}") +endfunction() + +function(ARROW_ADD_PKG_CONFIG MODULE) + configure_file(${MODULE}.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + @ONLY) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +endfunction() diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index 2ddf7880ceb43..55b425fcbcfe4 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -17,11 +17,15 @@ # # find_package(ClangTools) # -# Variables used by this module, they can change the default behaviour and need +# Variables used by this module which can change the default behaviour and need # to be set before calling find_package: # -# ClangToolsBin_HOME - -# When set, this path is inspected instead of standard library binary locations +# CLANG_FORMAT_VERSION - +# The version of clang-format to find. If this is not specified, clang-format +# will not be searched for. 
+# +# ClangTools_PATH - +# When set, this path is inspected in addition to standard library binary locations # to find clang-tidy and clang-format # # This module defines @@ -45,6 +49,13 @@ else() endif() endif() +set(CLANG_TOOLS_SEARCH_PATHS + ${ClangTools_PATH} + $ENV{CLANG_TOOLS_PATH} + /usr/local/bin /usr/bin + "C:/Program Files/LLVM/bin" + "${HOMEBREW_PREFIX}/bin") + find_program(CLANG_TIDY_BIN NAMES clang-tidy-4.0 clang-tidy-3.9 @@ -52,33 +63,29 @@ find_program(CLANG_TIDY_BIN clang-tidy-3.7 clang-tidy-3.6 clang-tidy - PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin" - NO_DEFAULT_PATH + PATHS ${CLANG_TOOLS_SEARCH_PATHS} NO_DEFAULT_PATH ) if ( "${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND" ) set(CLANG_TIDY_FOUND 0) - message("clang-tidy not found") + message(STATUS "clang-tidy not found") else() set(CLANG_TIDY_FOUND 1) - message("clang-tidy found at ${CLANG_TIDY_BIN}") + message(STATUS "clang-tidy found at ${CLANG_TIDY_BIN}") endif() if (CLANG_FORMAT_VERSION) find_program(CLANG_FORMAT_BIN NAMES clang-format-${CLANG_FORMAT_VERSION} - PATHS - ${ClangTools_PATH} - $ENV{CLANG_TOOLS_PATH} - /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin" - NO_DEFAULT_PATH + PATHS ${CLANG_TOOLS_SEARCH_PATHS} NO_DEFAULT_PATH ) # If not found yet, search alternative locations - if (("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") AND APPLE) + if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") + STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") + if (APPLE) # Homebrew ships older LLVM versions in /usr/local/opt/llvm@version/ - STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") - STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0") find_program(CLANG_FORMAT_BIN NAMES clang-format @@ -102,24 +109,29 @@ if (CLANG_FORMAT_VERSION) NO_DEFAULT_PATH ) endif() + else() + # try searching for "clang-format" and check the version + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS ${CLANG_TOOLS_SEARCH_PATHS} NO_DEFAULT_PATH + ) + if (NOT ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")) + execute_process(COMMAND ${CLANG_FORMAT_BIN} "-version" + OUTPUT_VARIABLE CLANG_FORMAT_FOUND_VERSION_MESSAGE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ("${CLANG_FORMAT_FOUND_VERSION_MESSAGE}" MATCHES "^clang-format version ${CLANG_FORMAT_MAJOR_VERSION}\\.${CLANG_FORMAT_MINOR_VERSION}.*")) + set(CLANG_FORMAT_BIN "CLANG_FORMAT_BIN-NOTFOUND") + endif() + endif() + endif() endif() -else() - find_program(CLANG_FORMAT_BIN - NAMES clang-format-4.0 - clang-format-3.9 - clang-format-3.8 - clang-format-3.7 - clang-format-3.6 - clang-format - PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin" - NO_DEFAULT_PATH - ) + endif() if ( "${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND" ) set(CLANG_FORMAT_FOUND 0) - message("clang-format not found") + message(STATUS "clang-format not found") else() set(CLANG_FORMAT_FOUND 1) - message("clang-format found at ${CLANG_FORMAT_BIN}") + message(STATUS "clang-format found at ${CLANG_FORMAT_BIN}") endif() diff --git a/cpp/cmake_modules/FindGTest.cmake b/cpp/cmake_modules/FindGTest.cmake index 8a31ae6e06357..c7496c6a3b9f1 100644 --- 
a/cpp/cmake_modules/FindGTest.cmake +++ b/cpp/cmake_modules/FindGTest.cmake @@ -28,7 +28,9 @@ # GTEST_INCLUDE_DIR, directory containing headers # GTEST_LIBS, directory containing gtest libraries # GTEST_STATIC_LIB, path to libgtest.a +# GTEST_STATIC_MAIN_LIB, path to libgtest_main.a # GTEST_SHARED_LIB, path to libgtest's shared library +# GTEST_SHARED_MAIN_LIB, path to libgtest_main's shared library # GTEST_FOUND, whether gtest has been found if( NOT "${GTEST_HOME}" STREQUAL "") @@ -38,34 +40,60 @@ elseif ( GTest_HOME ) list( APPEND _gtest_roots ${GTest_HOME} ) endif() +set(GTEST_STATIC_LIB_NAME + ${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GTEST_MAIN_STATIC_LIB_NAME + ${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GTEST_SHARED_LIB_NAME + ${CMAKE_SHARED_LIBRARY_PREFIX}gtest${CMAKE_SHARED_LIBRARY_SUFFIX}) +set(GTEST_MAIN_SHARED_LIB_NAME + ${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${CMAKE_SHARED_LIBRARY_SUFFIX}) + # Try the parameterized roots, if they exist -if ( _gtest_roots ) - find_path( GTEST_INCLUDE_DIR NAMES gtest/gtest.h - PATHS ${_gtest_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "include" ) - find_library( GTEST_LIBRARIES NAMES gtest gtest_main - PATHS ${_gtest_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib" ) -else () - find_path( GTEST_INCLUDE_DIR NAMES gtest/gtest.h ) - find_library( GTEST_LIBRARIES NAMES gtest ) -endif () +if(_gtest_roots) + find_path(GTEST_INCLUDE_DIR NAMES gtest/gtest.h + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include") + set(lib_dirs + "lib/${CMAKE_LIBRARY_ARCHITECTURE}" + "lib64" + "lib") + find_library(GTEST_STATIC_LIB NAMES ${GTEST_STATIC_LIB_NAME} + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + find_library(GTEST_MAIN_STATIC_LIB NAMES ${GTEST_MAIN_STATIC_LIB_NAME} + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + find_library(GTEST_SHARED_LIB NAMES ${GTEST_SHARED_LIB_NAME} + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + find_library(GTEST_MAIN_SHARED_LIB NAMES ${GTEST_MAIN_SHARED_LIB_NAME} + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) +else() + find_path(GTEST_INCLUDE_DIR NAMES gtest/gtest.h) + find_library(GTEST_STATIC_LIB NAMES ${GTEST_STATIC_LIB_NAME}) + find_library(GTEST_MAIN_STATIC_LIB NAMES ${GTEST_MAIN_STATIC_LIB_NAME}) + find_library(GTEST_SHARED_LIB NAMES ${GTEST_SHARED_LIB_NAME}) + find_library(GTEST_MAIN_SHARED_LIB NAMES ${GTEST_MAIN_SHARED_LIB_NAME}) +endif() -if (GTEST_INCLUDE_DIR AND GTEST_LIBRARIES) +if(GTEST_INCLUDE_DIR AND + (GTEST_STATIC_LIB AND GTEST_MAIN_STATIC_LIB) OR + (GTEST_SHARED_LIB AND GTEST_MAIN_SHARED_LIB)) set(GTEST_FOUND TRUE) - get_filename_component( GTEST_LIBS ${GTEST_LIBRARIES} PATH ) - set(GTEST_LIB_NAME gtest) - set(GTEST_STATIC_LIB ${GTEST_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${GTEST_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) - set(GTEST_MAIN_STATIC_LIB ${GTEST_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${GTEST_LIB_NAME}_main${CMAKE_STATIC_LIBRARY_SUFFIX}) - set(GTEST_SHARED_LIB ${GTEST_LIBS}/${CMAKE_SHARED_LIBRARY_PREFIX}${GTEST_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) -else () +else() set(GTEST_FOUND FALSE) -endif () +endif() if (GTEST_FOUND) if (NOT GTest_FIND_QUIETLY) - message(STATUS "Found the GTest library: ${GTEST_LIBRARIES}") + message(STATUS "Found the GTest library:") + message(STATUS "GTEST_STATIC_LIB: ${GTEST_STATIC_LIB}") + message(STATUS "GTEST_MAIN_STATIC_LIB: ${GTEST_MAIN_STATIC_LIB}") + message(STATUS "GTEST_SHARED_LIB: ${GTEST_SHARED_LIB}") + 
message(STATUS "GTEST_MAIN_SHARED_LIB: ${GTEST_MAIN_SHARED_LIB}")
  endif ()
else ()
  if (NOT GTest_FIND_QUIETLY)
@@ -86,7 +114,8 @@ endif ()
mark_as_advanced(
  GTEST_INCLUDE_DIR
  GTEST_LIBS
-  GTEST_LIBRARIES
  GTEST_STATIC_LIB
+  GTEST_MAIN_STATIC_LIB
  GTEST_SHARED_LIB
+  GTEST_MAIN_SHARED_LIB
)
diff --git a/cpp/cmake_modules/FindInferTools.cmake b/cpp/cmake_modules/FindInferTools.cmake
index 00c6709c67703..e2d10209b2a0e 100644
--- a/cpp/cmake_modules/FindInferTools.cmake
+++ b/cpp/cmake_modules/FindInferTools.cmake
@@ -38,8 +38,8 @@ find_program(INFER_BIN
if ( "${INFER_BIN}" STREQUAL "INFER_BIN-NOTFOUND" )
  set(INFER_FOUND 0)
-  message("infer not found")
+  message(STATUS "infer not found")
else()
  set(INFER_FOUND 1)
-  message("infer found at ${INFER_BIN}")
+  message(STATUS "infer found at ${INFER_BIN}")
endif()
diff --git a/cpp/cmake_modules/FindLLVM.cmake b/cpp/cmake_modules/FindLLVM.cmake
index 4094162a1d9dc..edc1b48888ace 100644
--- a/cpp/cmake_modules/FindLLVM.cmake
+++ b/cpp/cmake_modules/FindLLVM.cmake
@@ -21,9 +21,23 @@
#
set(GANDIVA_LLVM_VERSION 6.0)
+
+if (APPLE)
+  # Also look in homebrew for a matching llvm version
+  find_program(BREW_BIN brew)
+  if (BREW_BIN)
+    execute_process(
+      COMMAND ${BREW_BIN} --prefix "llvm@6"
+      OUTPUT_VARIABLE LLVM_BREW_PREFIX
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+  endif()
+endif()
+
find_package(LLVM ${GANDIVA_LLVM_VERSION} REQUIRED CONFIG HINTS
             /usr/local/opt/llvm
             /usr/share
+             ${LLVM_BREW_PREFIX}
             ${LLVM_DIR})
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
diff --git a/cpp/cmake_modules/FindThrift.cmake b/cpp/cmake_modules/FindThrift.cmake
index 540276699148d..cb0f819bd57bd 100644
--- a/cpp/cmake_modules/FindThrift.cmake
+++ b/cpp/cmake_modules/FindThrift.cmake
@@ -34,6 +34,17 @@ if( NOT "${THRIFT_HOME}" STREQUAL "")
  list( APPEND _thrift_roots ${_native_path} )
elseif ( Thrift_HOME )
  list( APPEND _thrift_roots ${Thrift_HOME} )
+elseif (APPLE)
+  # Also look in homebrew for a matching thrift version
+  find_program(BREW_BIN brew)
+  if (BREW_BIN)
+    execute_process(
+      COMMAND ${BREW_BIN} --prefix "thrift"
+      OUTPUT_VARIABLE THRIFT_BREW_PREFIX
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    list( APPEND _thrift_roots ${THRIFT_BREW_PREFIX} )
+  endif()
endif()
message(STATUS "THRIFT_HOME: ${THRIFT_HOME}")
diff --git a/cpp/cmake_modules/FindgRPC.cmake b/cpp/cmake_modules/FindgRPC.cmake
new file mode 100644
index 0000000000000..edf72864a7282
--- /dev/null
+++ b/cpp/cmake_modules/FindgRPC.cmake
@@ -0,0 +1,101 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
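The body of the new FindgRPC.cmake module follows. For orientation: whether gRPC is found via its CMake config package or via a manual search under GRPC_HOME, the module exports GRPC_INCLUDE_DIR, GPR_STATIC_LIB, GRPC_STATIC_LIB, GRPCPP_STATIC_LIB, GRPC_ADDRESS_SORTING_STATIC_LIB and GRPC_CPP_PLUGIN. A minimal consumption sketch, with a hypothetical target and source name (ThirdpartyToolchain.cmake later in this diff consumes the same variables through ADD_THIRDPARTY_LIB):

    # Hypothetical consumer of FindgRPC.cmake
    find_package(gRPC REQUIRED)
    include_directories(SYSTEM ${GRPC_INCLUDE_DIR})
    add_executable(flight_demo flight_demo.cc)  # hypothetical target/source
    target_link_libraries(flight_demo
      ${GRPCPP_STATIC_LIB} ${GRPC_STATIC_LIB}
      ${GPR_STATIC_LIB} ${GRPC_ADDRESS_SORTING_STATIC_LIB})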
+ +if( NOT "${GRPC_HOME}" STREQUAL "") + file (TO_CMAKE_PATH "${GRPC_HOME}" _grpc_path) +endif() + +message (STATUS "GRPC_HOME: ${GRPC_HOME}") + +find_package(gRPC CONFIG) +if (gRPC_FOUND) + message (STATUS "Found CMake installation of gRPC") + get_property(GRPC_INCLUDE_DIR TARGET gRPC::gpr PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + get_property(GPR_STATIC_LIB TARGET gRPC::gpr PROPERTY LOCATION) + get_property(GRPC_STATIC_LIB TARGET gRPC::grpc_unsecure PROPERTY LOCATION) + get_property(GRPCPP_STATIC_LIB TARGET gRPC::grpc++_unsecure PROPERTY LOCATION) + get_property(GRPC_ADDRESS_SORTING_STATIC_LIB + TARGET gRPC::address_sorting PROPERTY LOCATION) + # Get location of grpc_cpp_plugin so we can pass it to protoc + get_property(GRPC_CPP_PLUGIN TARGET gRPC::grpc_cpp_plugin PROPERTY LOCATION) +else() + find_path (GRPC_INCLUDE_DIR grpc/grpc.h HINTS + ${_grpc_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "include") + + set (lib_dirs "lib") + if (EXISTS "${_grpc_path}/lib64") + set (lib_dirs "lib64" ${lib_dirs}) + endif () + if (EXISTS "${_grpc_path}/lib/${CMAKE_LIBRARY_ARCHITECTURE}") + set (lib_dirs "lib/${CMAKE_LIBRARY_ARCHITECTURE}" ${lib_dirs}) + endif () + + find_library (GPR_STATIC_LIB + NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}gpr${CMAKE_STATIC_LIBRARY_SUFFIX}" + PATHS ${_grpc_path} + NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + + # On Debian/Ubuntu, libaddress_sorting is statically linked. + find_library (GRPC_ADDRESS_SORTING_STATIC_LIB + NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}" + PATHS ${_grpc_path} + NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + + find_library (GRPC_STATIC_LIB + NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}grpc${CMAKE_STATIC_LIBRARY_SUFFIX}" + PATHS ${_grpc_path} + NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + + find_library (GRPCPP_STATIC_LIB + NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}" + PATHS ${_grpc_path} + NO_DEFAULT_PATH + PATH_SUFFIXES ${lib_dirs}) + + find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin protoc-gen-grpc-cpp + HINTS ${_grpc_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "bin") +endif() + +if (GRPC_INCLUDE_DIR AND GPR_STATIC_LIB AND GRPC_ADDRESS_SORTING_STATIC_LIB AND + GRPC_STATIC_LIB AND GRPCPP_STATIC_LIB AND GRPC_CPP_PLUGIN) + set (gRPC_FOUND TRUE) +else () + set (gRPC_FOUND FALSE) +endif () + +if (gRPC_FOUND) + message (STATUS "Found the gRPC headers: ${GRPC_INCLUDE_DIR}") +else() + if (_grpc_path) + set (GRPC_ERR_MSG "Could not find gRPC. 
Looked in ${_grpc_path}.")
+  else ()
+    set (GRPC_ERR_MSG "Could not find gRPC in system search paths.")
+  endif()
+
+  if (gRPC_FIND_REQUIRED)
+    message (FATAL_ERROR "${GRPC_ERR_MSG}")
+  else ()
+    message (STATUS "${GRPC_ERR_MSG}")
+  endif ()
+endif()
+
+mark_as_advanced (
+  GRPC_INCLUDE_DIR
+  )
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 893ec360d3e55..44ca22f5dacb2 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -25,6 +25,9 @@ CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC)
# Arm64 compiler flags
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC)
+# Support C11
+set(CMAKE_C_STANDARD 11)
+
# This ensures that things like gnu++11 get passed correctly
set(CMAKE_CXX_STANDARD 11)
@@ -35,6 +38,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
# shared libraries
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+# If no build type is specified, default to release builds
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif(NOT CMAKE_BUILD_TYPE)
+string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+
# compiler flags that are common across debug/release builds
if (WIN32)
  # TODO(wesm): Change usages of C runtime functions that MSVC says are
@@ -68,10 +77,10 @@ if (WIN32)
  if (ARROW_USE_STATIC_CRT)
    foreach (c_flag CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      string(REPLACE "/MD" "-MT" ${c_flag} "${${c_flag}}")
+      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG
+      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      string(REPLACE "/MD" "-MT" ${c_flag} "${${c_flag}}")
    endforeach()
  endif()
@@ -83,16 +92,26 @@ else()
  set(CXX_COMMON_FLAGS "")
endif()
-# Build warning level (CHECKIN, EVERYTHING, etc.)
+# BUILD_WARNING_LEVEL adds warning/error compiler flags. The possible values are
+# - RELEASE: `-Werror` is not provided, so warnings do not halt the build.
+# - CHECKIN: Implies `-Werror -Wall` and some other warnings.
+# - EVERYTHING: Like `CHECKIN`, but with possible extra flags depending on the
+#   compiler, including `-Wextra`, `-Weverything`, `-pedantic`.
+#   This is the most aggressive warning level.
-# if no build warning level is specified, default to development warning level
+# BUILD_WARNING_LEVEL defaults to `CHECKIN`, unless CMAKE_BUILD_TYPE is
+# `RELEASE`, in which case it defaults to `PRODUCTION`. Defaulting to
+# `CHECKIN` surfaces warnings locally rather than after a slow CI round trip.
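The dispatch just below consumes these cache variables. As a usage sketch, both knobs can be pinned explicitly at configure time instead of relying on the fallbacks (the build-directory layout is hypothetical; values are upcased by the module):

    # Hypothetical configure invocation pinning build type and warning level
    cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_WARNING_LEVEL=EVERYTHING ../cpp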
if (NOT BUILD_WARNING_LEVEL) - set(BUILD_WARNING_LEVEL Production) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(BUILD_WARNING_LEVEL PRODUCTION) + else() + set(BUILD_WARNING_LEVEL CHECKIN) + endif() endif(NOT BUILD_WARNING_LEVEL) +string(TOUPPER ${BUILD_WARNING_LEVEL} BUILD_WARNING_LEVEL) -string(TOUPPER ${BUILD_WARNING_LEVEL} UPPERCASE_BUILD_WARNING_LEVEL) - -if ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") +if ("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") # Pre-checkin builds if ("${COMPILER_FAMILY}" STREQUAL "msvc") string(REPLACE "/W3" "" CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}") @@ -106,7 +125,7 @@ if ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") -Wno-shadow -Wno-switch-enum -Wno-exit-time-destructors \ -Wno-global-constructors -Wno-weak-template-vtables -Wno-undefined-reinterpret-cast \ -Wno-implicit-fallthrough -Wno-unreachable-code-return \ --Wno-float-equal -Wno-missing-prototypes \ +-Wno-float-equal -Wno-missing-prototypes -Wno-documentation-unknown-command \ -Wno-old-style-cast -Wno-covered-switch-default \ -Wno-cast-align -Wno-vla-extension -Wno-shift-sign-overflow \ -Wno-used-but-marked-unused -Wno-missing-variable-declarations \ @@ -141,7 +160,7 @@ if ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") else() message(FATAL_ERROR "Unknown compiler. Version info:\n${COMPILER_VERSION_FULL}") endif() -elseif ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "EVERYTHING") +elseif ("${BUILD_WARNING_LEVEL}" STREQUAL "EVERYTHING") # Pedantic builds for fixing warnings if ("${COMPILER_FAMILY}" STREQUAL "msvc") string(REPLACE "/W3" "" CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS}") @@ -337,11 +356,19 @@ endif() # Debug symbols are stripped for reduced binary size. Add # -DARROW_CXXFLAGS="-g" to add them if (NOT MSVC) - set(C_FLAGS_DEBUG "-ggdb -O0") - set(C_FLAGS_FASTDEBUG "-ggdb -O1") + if(ARROW_GGDB_DEBUG) + set(C_FLAGS_DEBUG "-ggdb -O0") + set(C_FLAGS_FASTDEBUG "-ggdb -O1") + set(CXX_FLAGS_DEBUG "-ggdb -O0") + set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") + else() + set(C_FLAGS_DEBUG "-g -O0") + set(C_FLAGS_FASTDEBUG "-g -O1") + set(CXX_FLAGS_DEBUG "-g -O0") + set(CXX_FLAGS_FASTDEBUG "-g -O1") + endif() + set(C_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CXX_FLAGS_DEBUG "-ggdb -O0") - set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") set(CXX_FLAGS_RELEASE "-O3 -DNDEBUG") endif() @@ -350,18 +377,14 @@ set(C_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") set(CXX_FLAGS_PROFILE_GEN "${CXX_FLAGS_RELEASE} -fprofile-generate") set(CXX_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") -# if no build build type is specified, default to debug builds -if (NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Debug) -endif(NOT CMAKE_BUILD_TYPE) -string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) # Set compile flags based on the build type. 
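For reference, a summary of what the per-build-type flag variables defined above expand to on non-MSVC compilers (RELWITHDEBINFO intentionally leaves the generator defaults untouched, per the empty branch below):

    # DEBUG     -> -g -O0   (-ggdb -O0 with ARROW_GGDB_DEBUG)
    # FASTDEBUG -> -g -O1   (-ggdb -O1 with ARROW_GGDB_DEBUG)
    # RELEASE   -> -O3 -DNDEBUG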
message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})") if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_DEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}") +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELWITHDEBINFO") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_FASTDEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_FASTDEBUG}") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9829a4d3fbd80..13a6a472cc4e9 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(toolchain) + # ---------------------------------------------------------------------- # Toolchain linkage options @@ -29,6 +31,7 @@ set(THIRDPARTY_DIR "${arrow_SOURCE_DIR}/thirdparty") if (NOT "$ENV{ARROW_BUILD_TOOLCHAIN}" STREQUAL "") set(BROTLI_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(BZ2_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") + set(CARES_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(DOUBLE_CONVERSION_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(FLATBUFFERS_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(GFLAGS_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") @@ -67,6 +70,10 @@ if (DEFINED ENV{BZ2_HOME}) set(BZ2_HOME "$ENV{BZ2_HOME}") endif() +if (DEFINED ENV{CARES_HOME}) + set(CARES_HOME "$ENV{CARES_HOME}") +endif() + if (DEFINED ENV{DOUBLE_CONVERSION_HOME}) set(DOUBLE_CONVERSION_HOME "$ENV{DOUBLE_CONVERSION_HOME}") endif() @@ -327,14 +334,8 @@ endif() # ---------------------------------------------------------------------- # Find pthreads -if (WIN32) - set(PTHREAD_LIBRARY "PTHREAD_LIBRARY-NOTFOUND") -else() - find_library(PTHREAD_LIBRARY pthread) - message(STATUS "Found pthread: ${PTHREAD_LIBRARY}") - add_library(pthreadshared SHARED IMPORTED) - set_target_properties(pthreadshared PROPERTIES IMPORTED_LOCATION ${PTHREAD_LIBRARY}) -endif() +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) # ---------------------------------------------------------------------- # Add Boost dependencies (code adapted from Apache Kudu (incubating)) @@ -344,6 +345,8 @@ if (MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() set(Boost_ADDITIONAL_VERSIONS + "1.70.0" "1.70" + "1.69.0" "1.69" "1.68.0" "1.68" "1.67.0" "1.67" "1.66.0" "1.66" @@ -367,15 +370,16 @@ if (ARROW_BOOST_VENDORED) set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) set(BOOST_REGEX_LIBRARY boost_regex_static) + if (ARROW_BOOST_HEADER_ONLY) set(BOOST_BUILD_PRODUCTS) set(BOOST_CONFIGURE_COMMAND "") set(BOOST_BUILD_COMMAND "") else() set(BOOST_BUILD_PRODUCTS - ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_REGEX_LIBRARY}) + ${BOOST_STATIC_SYSTEM_LIBRARY} + ${BOOST_STATIC_FILESYSTEM_LIBRARY} + ${BOOST_STATIC_REGEX_LIBRARY}) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh" "--prefix=${BOOST_PREFIX}" @@ -401,12 +405,19 @@ if (ARROW_BOOST_VENDORED) ${EP_LOG_OPTIONS}) set(Boost_INCLUDE_DIR "${BOOST_PREFIX}") set(Boost_INCLUDE_DIRS "${BOOST_INCLUDE_DIR}") - add_dependencies(arrow_dependencies boost_ep) + add_dependencies(toolchain boost_ep) else() if (MSVC) # disable autolinking in boost add_definitions(-DBOOST_ALL_NO_LIB) endif() + + if (DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) + # In older versions of CMake (such as 3.2), the system 
paths for Boost will + # be looked in first even if we set $BOOST_ROOT or pass -DBOOST_ROOT + set(Boost_NO_SYSTEM_PATHS ON) + endif() + if (ARROW_BOOST_USE_SHARED) # Find shared Boost libraries. set(Boost_USE_STATIC_LIBS OFF) @@ -499,15 +510,14 @@ if("${DOUBLE_CONVERSION_HOME}" STREQUAL "") CMAKE_ARGS ${DOUBLE_CONVERSION_CMAKE_ARGS} BUILD_BYPRODUCTS "${DOUBLE_CONVERSION_STATIC_LIB}") set(DOUBLE_CONVERSION_VENDORED 1) + add_dependencies(toolchain double-conversion_ep) else() find_package(double-conversion REQUIRED PATHS "${DOUBLE_CONVERSION_HOME}") set(DOUBLE_CONVERSION_VENDORED 0) endif() -if (DOUBLE_CONVERSION_VENDORED) - add_dependencies(arrow_dependencies double-conversion_ep) -else() +if (NOT DOUBLE_CONVERSION_VENDORED) get_property(DOUBLE_CONVERSION_STATIC_LIB TARGET double-conversion::double-conversion PROPERTY LOCATION) get_property(DOUBLE_CONVERSION_INCLUDE_DIR TARGET double-conversion::double-conversion @@ -523,57 +533,11 @@ message(STATUS "double-conversion include dir: ${DOUBLE_CONVERSION_INCLUDE_DIR}" message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB}") # ---------------------------------------------------------------------- -# Google gtest & gflags - -if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) - add_custom_target(unittest ctest -L unittest) - - if("${GTEST_HOME}" STREQUAL "") - if(APPLE) - set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") - elseif(NOT MSVC) - set(GTEST_CMAKE_CXX_FLAGS "-fPIC") - endif() - string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) - set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}} ${GTEST_CMAKE_CXX_FLAGS}") - - set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") - set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") - set(GTEST_STATIC_LIB - "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GTEST_MAIN_STATIC_LIB - "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GTEST_VENDORED 1) - set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} - -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) - if (MSVC AND NOT ARROW_USE_STATIC_CRT) - set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) - endif() - - ExternalProject_Add(googletest_ep - URL ${GTEST_SOURCE_URL} - BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB} - CMAKE_ARGS ${GTEST_CMAKE_ARGS} - ${EP_LOG_OPTIONS}) - else() - find_package(GTest REQUIRED) - set(GTEST_VENDORED 0) - endif() - - message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") - message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") - include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(gtest - STATIC_LIB ${GTEST_STATIC_LIB}) - ADD_THIRDPARTY_LIB(gtest_main - STATIC_LIB ${GTEST_MAIN_STATIC_LIB}) - - if(GTEST_VENDORED) - add_dependencies(gtest_static googletest_ep) - add_dependencies(gtest_main_static googletest_ep) - endif() +# gflags +if(ARROW_BUILD_TESTS OR + ARROW_BUILD_BENCHMARKS OR + (ARROW_USE_GLOG AND GLOG_HOME)) # gflags (formerly Googleflags) command line parsing if("${GFLAGS_HOME}" STREQUAL "") set(GFLAGS_CMAKE_CXX_FLAGS ${EP_CXX_FLAGS}) @@ -626,10 +590,75 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) endif() endif() -if(ARROW_BUILD_BENCHMARKS) - add_custom_target(runbenchmark ctest -L benchmark) +# ---------------------------------------------------------------------- +# Google 
gtest +if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) + if("${GTEST_HOME}" STREQUAL "") + if(APPLE) + set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") + elseif(NOT MSVC) + set(GTEST_CMAKE_CXX_FLAGS "-fPIC") + endif() + string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) + set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}} ${GTEST_CMAKE_CXX_FLAGS}") + + set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") + set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") + set(GTEST_STATIC_LIB + "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GTEST_MAIN_STATIC_LIB + "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GTEST_VENDORED 1) + set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} + -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) + if (MSVC AND NOT ARROW_USE_STATIC_CRT) + set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) + endif() + + ExternalProject_Add(googletest_ep + URL ${GTEST_SOURCE_URL} + BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB} + CMAKE_ARGS ${GTEST_CMAKE_ARGS} + ${EP_LOG_OPTIONS}) + else() + find_package(GTest REQUIRED) + set(GTEST_VENDORED 0) + endif() + + message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + if(GTEST_STATIC_LIB) + message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") + ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) + ADD_THIRDPARTY_LIB(gtest_main + STATIC_LIB ${GTEST_MAIN_STATIC_LIB}) + set(GTEST_LIBRARY gtest_static) + set(GTEST_MAIN_LIBRARY gtest_main_static) + else() + message(STATUS "GTest shared library: ${GTEST_SHARED_LIB}") + ADD_THIRDPARTY_LIB(gtest + SHARED_LIB ${GTEST_SHARED_LIB}) + ADD_THIRDPARTY_LIB(gtest_main + SHARED_LIB ${GTEST_MAIN_SHARED_LIB}) + set(GTEST_LIBRARY gtest_shared) + set(GTEST_MAIN_LIBRARY gtest_main_shared) + endif() + + if(GTEST_VENDORED) + add_dependencies(${GTEST_LIBRARY} googletest_ep) + add_dependencies(${GTEST_MAIN_LIBRARY} googletest_ep) + endif() +endif() + +if(ARROW_BUILD_BENCHMARKS) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") + if(CMAKE_VERSION VERSION_LESS 3.6) + message(FATAL_ERROR "Building gbenchmark from source requires at least CMake 3.6") + endif() + if(NOT MSVC) set(GBENCHMARK_CMAKE_CXX_FLAGS "-fPIC -std=c++11 ${EP_CXX_FLAGS}") endif() @@ -664,11 +693,11 @@ if(ARROW_BUILD_BENCHMARKS) message(STATUS "GBenchmark include dir: ${GBENCHMARK_INCLUDE_DIR}") message(STATUS "GBenchmark static library: ${GBENCHMARK_STATIC_LIB}") include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(benchmark + ADD_THIRDPARTY_LIB(gbenchmark STATIC_LIB ${GBENCHMARK_STATIC_LIB}) if(GBENCHMARK_VENDORED) - add_dependencies(benchmark_static gbenchmark_ep) + add_dependencies(gbenchmark_static gbenchmark_ep) endif() endif() @@ -688,6 +717,7 @@ if (ARROW_IPC) ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR) set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include") set(RAPIDJSON_VENDORED 1) + add_dependencies(toolchain rapidjson_ep) else() set(RAPIDJSON_INCLUDE_DIR "${RAPIDJSON_HOME}/include") set(RAPIDJSON_VENDORED 0) @@ -695,10 +725,6 @@ if (ARROW_IPC) message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}") include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) - if(RAPIDJSON_VENDORED) - add_dependencies(arrow_dependencies rapidjson_ep) - endif() - ## 
Flatbuffers if("${FLATBUFFERS_HOME}" STREQUAL "") set(FLATBUFFERS_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers_ep-prefix/src/flatbuffers_ep-install") @@ -722,15 +748,12 @@ if (ARROW_IPC) set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_PREFIX}/include") set(FLATBUFFERS_COMPILER "${FLATBUFFERS_PREFIX}/bin/flatc") set(FLATBUFFERS_VENDORED 1) + add_dependencies(toolchain flatbuffers_ep) else() find_package(Flatbuffers REQUIRED) set(FLATBUFFERS_VENDORED 0) endif() - if(FLATBUFFERS_VENDORED) - add_dependencies(arrow_dependencies flatbuffers_ep) - endif() - message(STATUS "Flatbuffers include dir: ${FLATBUFFERS_INCLUDE_DIR}") message(STATUS "Flatbuffers compiler: ${FLATBUFFERS_COMPILER}") include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) @@ -760,7 +783,7 @@ if (ARROW_JEMALLOC) ExternalProject_Add(jemalloc_ep URL ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/jemalloc/${JEMALLOC_VERSION}.tar.gz PATCH_COMMAND touch doc/jemalloc.3 doc/jemalloc.html - CONFIGURE_COMMAND ./autogen.sh "--prefix=${JEMALLOC_PREFIX}" "--with-jemalloc-prefix=je_arrow_" "--with-private-namespace=je_arrow_private_" "--disable-tls" + CONFIGURE_COMMAND ./autogen.sh "AR=${CMAKE_AR}" "CC=${CMAKE_C_COMPILER}" "--prefix=${JEMALLOC_PREFIX}" "--with-jemalloc-prefix=je_arrow_" "--with-private-namespace=je_arrow_private_" "--disable-tls" ${EP_LOG_OPTIONS} BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS} @@ -773,7 +796,7 @@ if (ARROW_JEMALLOC) ADD_THIRDPARTY_LIB(jemalloc STATIC_LIB ${JEMALLOC_STATIC_LIB} SHARED_LIB ${JEMALLOC_SHARED_LIB} - DEPS ${PTHREAD_LIBRARY}) + DEPS Threads::Threads) add_dependencies(jemalloc_static jemalloc_ep) endif() @@ -893,6 +916,8 @@ if (ARROW_WITH_SNAPPY) "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}" "-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS}" "-DCMAKE_C_FLAGS=${EP_C_FLAGS}" + "-DCMAKE_AR=${CMAKE_AR}" + "-DCMAKE_RANLIB=${CMAKE_RANLIB}" "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") set(SNAPPY_UPDATE_COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/cmake_modules/SnappyCMakeLists.txt @@ -911,7 +936,7 @@ if (ARROW_WITH_SNAPPY) BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") else() ExternalProject_Add(snappy_ep - CONFIGURE_COMMAND ./configure --with-pic "--prefix=${SNAPPY_PREFIX}" ${SNAPPY_CXXFLAGS} + CONFIGURE_COMMAND ./configure --with-pic "AR=${CMAKE_AR}" "RANLIB=${CMAKE_RANLIB}" "--prefix=${SNAPPY_PREFIX}" ${SNAPPY_CXXFLAGS} ${EP_LOG_OPTIONS} BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} @@ -1033,7 +1058,7 @@ if (ARROW_WITH_LZ4) set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. 
apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_gl_runtimelibrary_params.patch) else() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") - set(LZ4_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh) + set(LZ4_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh "AR=${CMAKE_AR}") endif() ExternalProject_Add(lz4_ep @@ -1094,6 +1119,11 @@ if (ARROW_WITH_ZSTD) "-DCMAKE_C_FLAGS=${EP_C_FLAGS}") endif() + if(CMAKE_VERSION VERSION_LESS 3.7) + message(FATAL_ERROR "Building zstd using ExternalProject requires \ +at least CMake 3.7") + endif() + ExternalProject_Add(zstd_ep ${EP_LOG_OPTIONS} CMAKE_ARGS ${ZSTD_CMAKE_ARGS} @@ -1139,6 +1169,7 @@ if (ARROW_GANDIVA) CMAKE_ARGS ${RE2_CMAKE_ARGS} BUILD_BYPRODUCTS "${RE2_STATIC_LIB}") set (RE2_VENDORED 1) + add_dependencies(toolchain re2_ep) else () find_package (RE2 REQUIRED) set (RE2_VENDORED 0) @@ -1155,10 +1186,6 @@ if (ARROW_GANDIVA) STATIC_LIB ${RE2_STATIC_LIB}) set(RE2_LIBRARY re2_static) endif() - - if (RE2_VENDORED) - add_dependencies (arrow_dependencies re2_ep) - endif () endif () @@ -1175,7 +1202,7 @@ if (ARROW_ORC OR ARROW_FLIGHT OR ARROW_GANDIVA) set (PROTOBUF_EXECUTABLE "${PROTOBUF_PREFIX}/bin/protoc") ExternalProject_Add(protobuf_ep - CONFIGURE_COMMAND "./configure" "--disable-shared" "--prefix=${PROTOBUF_PREFIX}" "CXXFLAGS=${EP_CXX_FLAGS}" + CONFIGURE_COMMAND "./configure" "AR=${CMAKE_AR}" "RANLIB=${CMAKE_RANLIB}" "CC=${CMAKE_C_COMPILER}" "CXX=${CMAKE_CXX_COMPILER}" "--disable-shared" "--prefix=${PROTOBUF_PREFIX}" "CXXFLAGS=${EP_CXX_FLAGS}" BUILD_IN_SOURCE 1 URL ${PROTOBUF_SOURCE_URL} BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}" "${PROTOBUF_EXECUTABLE}" @@ -1214,7 +1241,9 @@ if (ARROW_FLIGHT) set(GRPC_INCLUDE_DIR "${GRPC_PREFIX}/include") set(GRPC_STATIC_LIBRARY_GPR "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}gpr${CMAKE_STATIC_LIBRARY_SUFFIX}") set(GRPC_STATIC_LIBRARY_GRPC "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}grpc${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_STATIC_LIBRARY_GRPCPP "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}grpcpp${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GRPC_STATIC_LIBRARY_GRPCPP "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GRPC_STATIC_LIBRARY_ADDRESS_SORTING "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GRPC_STATIC_LIBRARY_CARES "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/third_party/cares/cares/lib/${CMAKE_STATIC_LIBRARY_PREFIX}cares${CMAKE_STATIC_LIBRARY_SUFFIX}") set(GRPC_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${EP_C_FLAGS}" @@ -1229,31 +1258,47 @@ if (ARROW_FLIGHT) ${EP_LOG_OPTIONS} CMAKE_ARGS ${GRPC_CMAKE_ARGS} ${EP_LOG_OPTIONS}) - include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) + + set(GPR_STATIC_LIB "${GRPC_STATIC_LIBRARY_GPR}") + set(GRPC_STATIC_LIB "${GRPC_STATIC_LIBRARY_GRPC}") + set(GRPCPP_STATIC_LIB "${GRPC_STATIC_LIBRARY_GRPCPP}") + set(GRPC_ADDRESS_SORTING_STATIC_LIB "${GRPC_STATIC_LIBRARY_ADDRESS_SORTING}") + # XXX(wesm): relying on vendored c-ares provided by gRPC for the time being + set(CARES_STATIC_LIB "${GRPC_STATIC_LIBRARY_CARES}") + set(GRPC_CPP_PLUGIN "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/grpc_cpp_plugin") else() - find_package(gRPC CONFIG REQUIRED) + find_package(gRPC REQUIRED) set(GRPC_VENDORED 0) endif() - get_property(GPR_STATIC_LIB TARGET 
gRPC::gpr PROPERTY LOCATION) + # If we built gRPC ourselves, we should use its c-ares. + if ("${CARES_STATIC_LIB}" STREQUAL "") + if (NOT "${CARES_HOME}" STREQUAL "") + set(CARES_STATIC_LIB "${CARES_HOME}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}cares_static${CMAKE_STATIC_LIBRARY_SUFFIX}") + elseif (c-ares_FOUND) + get_property(CARES_STATIC_LIB TARGET c-ares::cares_static PROPERTY LOCATION) + endif() + endif() + message(STATUS "Found the c-ares library: ${CARES_STATIC_LIB}") + + if ("${GRPC_CPP_PLUGIN}" STREQUAL "") + message(SEND_ERROR "Please set GRPC_CPP_PLUGIN.") + endif() + + include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(grpc_gpr STATIC_LIB ${GPR_STATIC_LIB}) - get_property(GRPC_STATIC_LIB TARGET gRPC::grpc_unsecure PROPERTY LOCATION) ADD_THIRDPARTY_LIB(grpc_grpc STATIC_LIB ${GRPC_STATIC_LIB}) - get_property(GRPCPP_STATIC_LIB TARGET gRPC::grpc++_unsecure PROPERTY LOCATION) ADD_THIRDPARTY_LIB(grpc_grpcpp STATIC_LIB ${GRPCPP_STATIC_LIB}) - get_property(GRPC_ADDRESS_SORTING_STATIC_LIB - TARGET gRPC::address_sorting PROPERTY LOCATION) ADD_THIRDPARTY_LIB(grpc_address_sorting STATIC_LIB ${GRPC_ADDRESS_SORTING_STATIC_LIB}) - # XXX(wesm): relying on vendored c-ares provided by gRPC for the time being - get_property(CARES_STATIC_LIB TARGET c-ares::cares_static PROPERTY LOCATION) ADD_THIRDPARTY_LIB(cares STATIC_LIB ${CARES_STATIC_LIB}) endif() @@ -1301,6 +1346,8 @@ if (ARROW_ORC) CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS}) + add_dependencies(toolchain orc_ep) + set(ORC_VENDORED 1) add_dependencies(orc_ep ${ZLIB_LIBRARY}) if (LZ4_VENDORED) @@ -1326,7 +1373,6 @@ if (ARROW_ORC) if (ORC_VENDORED) add_dependencies(orc_static orc_ep) endif() - endif() # ---------------------------------------------------------------------- @@ -1413,10 +1459,31 @@ if (NOT THRIFT_FOUND) "-DWITH_PLUGIN=OFF" ${THRIFT_CMAKE_ARGS}) elseif (APPLE) - if (DEFINED BISON_EXECUTABLE) - set(THRIFT_CMAKE_ARGS "-DBISON_EXECUTABLE=${BISON_EXECUTABLE}" - ${THRIFT_CMAKE_ARGS}) + # Some other process always resets BISON_EXECUTABLE to the system default, + # thus we use our own variable here. + if (NOT DEFINED THRIFT_BISON_EXECUTABLE) + find_package(BISON 2.5.1) + + # In the case where we cannot find a system-wide installation, look for + # homebrew and ask for its bison installation. 
+ if (NOT BISON_FOUND) + find_program(BREW_BIN brew) + if (BREW_BIN) + execute_process( + COMMAND ${BREW_BIN} --prefix bison + OUTPUT_VARIABLE BISON_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + set(BISON_EXECUTABLE "${BISON_PREFIX}/bin/bison") + find_package(BISON 2.5.1) + set(THRIFT_BISON_EXECUTABLE "${BISON_EXECUTABLE}") + endif() + else() + set(THRIFT_BISON_EXECUTABLE "${BISON_EXECUTABLE}") + endif() endif() + set(THRIFT_CMAKE_ARGS "-DBISON_EXECUTABLE=${THRIFT_BISON_EXECUTABLE}" + ${THRIFT_CMAKE_ARGS}) endif() ExternalProject_Add(thrift_ep @@ -1459,7 +1526,7 @@ if (ARROW_USE_GLOG) set(GLOG_STATIC_LIB "${GLOG_BUILD_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}glog${CMAKE_STATIC_LIBRARY_SUFFIX}") set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} -fPIC") - if (PTHREAD_LIBRARY) + if (Threads::Threads) set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread") set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} -fPIC -pthread") endif() @@ -1497,10 +1564,14 @@ if (ARROW_USE_GLOG) message(STATUS "Glog static library: ${GLOG_STATIC_LIB}") include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(glog - STATIC_LIB ${GLOG_STATIC_LIB}) if (GLOG_VENDORED) + ADD_THIRDPARTY_LIB(glog + STATIC_LIB ${GLOG_STATIC_LIB}) add_dependencies(glog_static glog_ep) + else() + ADD_THIRDPARTY_LIB(glog + STATIC_LIB ${GLOG_STATIC_LIB} + DEPS gflags_static) endif() endif() diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index f2de9cf1f7553..22a9b0c8098a0 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -22,19 +22,6 @@ if (${ARROW_USE_ASAN}) ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.8"))) message(SEND_ERROR "Cannot use ASAN without clang or gcc >= 4.8") endif() - - # If UBSAN is also enabled, and we're on clang < 3.5, ensure static linking is - # enabled. Otherwise, we run into https://llvm.org/bugs/show_bug.cgi?id=18211 - if("${ARROW_USE_UBSAN}" AND - "${COMPILER_FAMILY}" STREQUAL "clang" AND - "${COMPILER_VERSION}" VERSION_LESS "3.5") - if("${ARROW_LINK}" STREQUAL "a") - message("Using static linking for ASAN+UBSAN build") - set(ARROW_LINK "s") - elseif("${ARROW_LINK}" STREQUAL "d") - message(SEND_ERROR "Cannot use dynamic linking when ASAN and UBSAN are both enabled") - endif() - endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -DADDRESS_SANITIZER") endif() @@ -49,7 +36,7 @@ if (${ARROW_USE_UBSAN}) ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.9"))) message(SEND_ERROR "Cannot use UBSAN without clang or gcc >= 4.9") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover=all") endif () # Flag to enable thread sanitizer (clang or gcc 4.8) @@ -101,14 +88,7 @@ if ("${ARROW_USE_UBSAN}" OR "${ARROW_USE_ASAN}" OR "${ARROW_USE_TSAN}") # GCC 4.8 and 4.9 (latest as of this writing) don't allow you to specify a # sanitizer blacklist. if("${COMPILER_FAMILY}" STREQUAL "clang") - # Require clang 3.4 or newer; clang 3.3 has issues with TSAN and pthread - # symbol interception. - if("${COMPILER_VERSION}" VERSION_LESS "3.4") - message(SEND_ERROR "Must use clang 3.4 or newer to run a sanitizer build." - " Detected unsupported version ${COMPILER_VERSION}." 
- " Try using clang from $NATIVE_TOOLCHAIN/.") - endif() - add_definitions("-fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitize-blacklist.txt") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitize-blacklist.txt") else() message(WARNING "GCC does not support specifying a sanitizer blacklist. Known sanitizer check failures will not be suppressed.") endif() diff --git a/cpp/src/arrow/util/string_view/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt similarity index 90% rename from cpp/src/arrow/util/string_view/CMakeLists.txt rename to cpp/examples/arrow/CMakeLists.txt index bae6bdb807d92..6ecb537ad9787 100644 --- a/cpp/src/arrow/util/string_view/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -install(FILES - string_view.hpp - DESTINATION include/arrow/util/string_view) +ADD_ARROW_EXAMPLE(row-wise-conversion-example) diff --git a/cpp/examples/arrow/row-wise-conversion-example.cc b/cpp/examples/arrow/row-wise-conversion-example.cc new file mode 100644 index 0000000000000..db8c28753dbe6 --- /dev/null +++ b/cpp/examples/arrow/row-wise-conversion-example.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include + +using arrow::DoubleBuilder; +using arrow::Int64Builder; +using arrow::ListBuilder; + +// While we want to use columnar data structures to build efficient operations, we +// often receive data in a row-wise fashion from other systems. In the following, +// we want give a brief introduction into the classes provided by Apache Arrow by +// showing how to transform row-wise data into a columnar table. +// +// The data in this example is stored in the following struct: +struct data_row { + int64_t id; + double cost; + std::vector cost_components; +}; + +// Transforming a vector of structs into a columnar Table. +// +// The final representation should be an `arrow::Table` which in turn is made up of +// an `arrow::Schema` and a list of `arrow::Column`. An `arrow::Column` is again a +// named collection of one or more `arrow::Array` instances. As the first step, we +// will iterate over the data and build up the arrays incrementally. For this task, +// we provide `arrow::ArrayBuilder` classes that help in the construction of the +// final `arrow::Array` instances. +// +// For each type, Arrow has a specially typed builder class. For the primitive +// values `id` and `cost` we can use the respective `arrow::Int64Builder` and +// `arrow::DoubleBuilder`. 
For the `cost_components` vector, we need to have two +// builders, a top-level `arrow::ListBuilder` that builds the array of offsets and +// a nested `arrow::DoubleBuilder` that constructs the underlying values array that +// is referenced by the offsets in the former array. +arrow::Status VectorToColumnarTable(const std::vector& rows, + std::shared_ptr* table) { + // The builders are more efficient using + // arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of + // the underlying memory regions in-place. At the moment, arrow::jemalloc is only + // supported on Unix systems, not Windows. + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + Int64Builder id_builder(pool); + DoubleBuilder cost_builder(pool); + ListBuilder components_builder(pool, std::make_shared(pool)); + // The following builder is owned by components_builder. + DoubleBuilder& cost_components_builder = + *(static_cast(components_builder.value_builder())); + + // Now we can loop over our existing data and insert it into the builders. The + // `Append` calls here may fail (e.g. we cannot allocate enough additional memory). + // Thus we need to check their return values. For more information on these values, + // check the documentation about `arrow::Status`. + for (const data_row& row : rows) { + ARROW_RETURN_NOT_OK(id_builder.Append(row.id)); + ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost)); + + // Indicate the start of a new list row. This will memorise the current + // offset in the values builder. + ARROW_RETURN_NOT_OK(components_builder.Append()); + // Store the actual values. The final nullptr argument tells the underyling + // builder that all added values are valid, i.e. non-null. + ARROW_RETURN_NOT_OK(cost_components_builder.AppendValues(row.cost_components.data(), + row.cost_components.size())); + } + + // At the end, we finalise the arrays, declare the (type) schema and combine them + // into a single `arrow::Table`: + std::shared_ptr id_array; + ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array)); + std::shared_ptr cost_array; + ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array)); + // No need to invoke cost_components_builder.Finish because it is implied by + // the parent builder's Finish invocation. + std::shared_ptr cost_components_array; + ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array)); + + std::vector> schema_vector = { + arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()), + arrow::field("cost_components", arrow::list(arrow::float64()))}; + + auto schema = std::make_shared(schema_vector); + + // The final `table` variable is the one we then can pass on to other functions + // that can consume Apache Arrow memory structures. This object has ownership of + // all referenced data, thus we don't have to care about undefined references once + // we leave the scope of the function building the table and its underlying arrays. + *table = arrow::Table::Make(schema, {id_array, cost_array, cost_components_array}); + + return arrow::Status::OK(); +} + +arrow::Status ColumnarTableToVector(const std::shared_ptr& table, + std::vector* rows) { + // To convert an Arrow table back into the same row-wise representation as in the + // above section, we first will check that the table conforms to our expected + // schema and then will build up the vector of rows incrementally. + // + // For the check if the table is as expected, we can utilise solely its schema. 
+ std::vector> schema_vector = { + arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()), + arrow::field("cost_components", arrow::list(arrow::float64()))}; + auto expected_schema = std::make_shared(schema_vector); + + if (!expected_schema->Equals(*table->schema())) { + // The table doesn't have the expected schema thus we cannot directly + // convert it to our target representation. + return arrow::Status::Invalid("Schemas are not matching!"); + } + + // As we have ensured that the table has the expected structure, we can unpack the + // underlying arrays. For the primitive columns `id` and `cost` we can use the high + // level functions to get the values whereas for the nested column + // `cost_components` we need to access the C-pointer to the data to copy its + // contents into the resulting `std::vector`. Here we need to be care to + // also add the offset to the pointer. This offset is needed to enable zero-copy + // slicing operations. While this could be adjusted automatically for double + // arrays, this cannot be done for the accompanying bitmap as often the slicing + // border would be inside a byte. + + auto ids = + std::static_pointer_cast(table->column(0)->data()->chunk(0)); + auto costs = + std::static_pointer_cast(table->column(1)->data()->chunk(0)); + auto cost_components = + std::static_pointer_cast(table->column(2)->data()->chunk(0)); + auto cost_components_values = + std::static_pointer_cast(cost_components->values()); + // To enable zero-copy slices, the native values pointer might need to account + // for this slicing offset. This is not needed for the higher level functions + // like Value(…) that already account for this offset internally. + const double* ccv_ptr = cost_components_values->data()->GetValues(1); + + for (int64_t i = 0; i < table->num_rows(); i++) { + // Another simplification in this example is that we assume that there are + // no null entries, e.g. each row is fill with valid values. 
+ int64_t id = ids->Value(i); + double cost = costs->Value(i); + const double* first = ccv_ptr + cost_components->value_offset(i); + const double* last = ccv_ptr + cost_components->value_offset(i + 1); + std::vector components_vec(first, last); + rows->push_back({id, cost, components_vec}); + } + + return arrow::Status::OK(); +} + +#define EXIT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + return EXIT_FAILURE; \ + } \ + } while (0); + +int main(int argc, char** argv) { + std::vector rows = { + {1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}}; + + std::shared_ptr table; + EXIT_ON_FAILURE(VectorToColumnarTable(rows, &table)); + + std::vector expected_rows; + EXIT_ON_FAILURE(ColumnarTableToVector(table, &expected_rows)); + + assert(rows.size() == expected_rows.size()); + + return EXIT_SUCCESS; +} diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 98c5cd9402bb7..db172a2534f37 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -22,7 +22,7 @@ target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) -add_executable(parquet-arrow-example parquet-arrow/src/reader-writer.cc) +add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) target_link_libraries(parquet-arrow-example parquet_shared) add_dependencies(parquet diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt index d9e01acd3eea3..915930ec228e1 100644 --- a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt +++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt @@ -38,5 +38,5 @@ find_package(Parquet) include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR}) -add_executable(parquet-arrow-example src/reader-writer.cc) +add_executable(parquet-arrow-example reader-writer.cc) target_link_libraries(parquet-arrow-example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB}) diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/reader-writer.cc similarity index 98% rename from cpp/examples/parquet/parquet-arrow/src/reader-writer.cc rename to cpp/examples/parquet/parquet-arrow/reader-writer.cc index 8d474486e7413..a5f928b6d4f69 100644 --- a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc +++ b/cpp/examples/parquet/parquet-arrow/reader-writer.cc @@ -100,7 +100,7 @@ void read_single_column() { std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; + std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; @@ -119,7 +119,7 @@ void read_single_column_chunk() { std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; + std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6858f3c4c4fbe..22ce6e913368e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -15,13 +15,66 @@ # 
specific language governing permissions and limitations # under the License. +add_custom_target(arrow-all) +add_custom_target(arrow) +add_custom_target(arrow-benchmarks) +add_custom_target(arrow-tests) +add_dependencies(arrow-all arrow arrow-tests arrow-benchmarks) + +# Adding unit tests part of the "arrow" portion of the test suite +function(ADD_ARROW_TEST REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args LABELS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow") + endif() + + if (ARG_LABELS) + set(LABELS ${ARG_LABELS}) + else() + set(LABELS "arrow-tests") + endif() + + ADD_TEST_CASE(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS ${LABELS} + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +function(ADD_ARROW_BENCHMARK REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow") + endif() + ADD_BENCHMARK(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS "arrow-benchmarks" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + set(ARROW_SRCS array.cc - buffer.cc + builder.cc - builder-adaptive.cc - builder-binary.cc - builder-dict.cc + array/builder_adaptive.cc + array/builder_base.cc + array/builder_binary.cc + array/builder_decimal.cc + array/builder_dict.cc + array/builder_nested.cc + array/builder_primitive.cc + + buffer.cc compare.cc memory_pool.cc pretty_print.cc @@ -30,6 +83,7 @@ set(ARROW_SRCS table.cc table_builder.cc tensor.cc + sparse_tensor.cc type.cc visitor.cc @@ -47,6 +101,7 @@ set(ARROW_SRCS io/memory.cc io/readahead.cc + util/basic_decimal.cc util/bit-util.cc util/compression.cc util/cpu-info.cc @@ -57,6 +112,7 @@ set(ARROW_SRCS util/key_value_metadata.cc util/task-group.cc util/thread-pool.cc + util/trie.cc util/utf8.cc ) @@ -142,6 +198,7 @@ if (ARROW_IPC) ipc/feather.cc ipc/json.cc ipc/json-internal.cc + ipc/json-simple.cc ipc/message.cc ipc/metadata-internal.cc ipc/reader.cc @@ -170,7 +227,11 @@ ADD_ARROW_LIB(arrow SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} SHARED_LINK_LIBS ${ARROW_LINK_LIBS} SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} - STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) + STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS} + SHARED_INSTALL_INTERFACE_LIBS ${ARROW_SHARED_INSTALL_INTERFACE_LIBS} + STATIC_INSTALL_INTERFACE_LIBS ${ARROW_STATIC_INSTALL_INTERFACE_LIBS}) + +add_dependencies(arrow ${ARROW_LIBRARIES}) if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) @@ -181,8 +242,8 @@ if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) ADD_ARROW_LIB(arrow_testing SOURCES test-util.cc OUTPUTS ARROW_TESTING_LIBRARIES - DEPENDENCIES gtest_static - SHARED_LINK_LIBS arrow_shared gtest_static + DEPENDENCIES ${GTEST_LIBRARY} + SHARED_LINK_LIBS arrow_shared ${GTEST_LIBRARY} STATIC_LINK_LIBS arrow_static) if (ARROW_BUILD_STATIC AND WIN32) @@ -199,43 +260,17 @@ find_package(Backtrace) foreach(LIB_TARGET ${ARROW_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) - if (Backtrace_FOUND) + if (Backtrace_FOUND AND ARROW_WITH_BACKTRACE) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_WITH_BACKTRACE) endif() endforeach() # Headers: top level -install(FILES - allocator.h - api.h - array.h - buffer.h - builder.h - compare.h - memory_pool.h - pretty_print.h - record_batch.h - status.h - 
stl.h
-  table.h
-  table_builder.h
-  tensor.h
-  type.h
-  type_fwd.h
-  type_traits.h
-  test-util.h
-  visitor.h
-  visitor_inline.h
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow")
+ARROW_INSTALL_ALL_HEADERS("arrow")

# pkg-config support
-configure_file(arrow.pc.in
-  "${CMAKE_CURRENT_BINARY_DIR}/arrow.pc"
-  @ONLY)
-install(
-  FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow.pc"
-  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/")
+ARROW_ADD_PKG_CONFIG("arrow")

#######################################
# Unit tests
@@ -255,10 +290,25 @@ ADD_ARROW_TEST(type-test)
ADD_ARROW_TEST(table-test)
ADD_ARROW_TEST(table_builder-test)
ADD_ARROW_TEST(tensor-test)
+ADD_ARROW_TEST(sparse_tensor-test)

ADD_ARROW_BENCHMARK(builder-benchmark)
ADD_ARROW_BENCHMARK(column-benchmark)

+add_subdirectory(array)
add_subdirectory(csv)
add_subdirectory(io)
add_subdirectory(util)
+add_subdirectory(vendored)
+
+if(ARROW_FLIGHT)
+  add_subdirectory(flight)
+endif()
+
+if(ARROW_PYTHON)
+  add_subdirectory(python)
+endif()
+
+if(ARROW_HIVESERVER2)
+  add_subdirectory(dbi/hiveserver2)
+endif()
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index de803d5ba6f03..01fc09afb0c92 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -206,11 +206,7 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
      *out = union_(fields, type_codes);
      break;
    }
-    default: {
-      std::stringstream ss;
-      ss << "Unknown Orc type kind: " << kind;
-      return Status::Invalid(ss.str());
-    }
+    default: { return Status::Invalid("Unknown Orc type kind: ", kind); }
  }
  return Status::OK();
}
@@ -346,11 +342,9 @@ class ORCFileReader::Impl {
  }

  Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
-    if (stripe < 0 || stripe >= NumberOfStripes()) {
-      std::stringstream ss;
-      ss << "Out of bounds stripe: " << stripe;
-      return Status::Invalid(ss.str());
-    }
+    ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
+                    Status::Invalid("Out of bounds stripe: ", stripe));
+
    opts->range(stripes_[stripe].offset, stripes_[stripe].length);
    return Status::OK();
  }
@@ -359,9 +353,7 @@ class ORCFileReader::Impl {
                      const std::vector<int>& include_indices) {
    std::list<uint64_t> include_indices_list;
    for (auto it = include_indices.begin(); it != include_indices.end(); ++it) {
-      if (*it < 0) {
-        return Status::Invalid("Negative field index");
-      }
+      ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
      include_indices_list.push_back(*it);
    }
    opts->includeTypes(include_indices_list);
@@ -455,9 +447,7 @@ class ORCFileReader::Impl {
      case liborc::DECIMAL:
        return AppendDecimalBatch(type, batch, offset, length, builder);
      default:
-        std::stringstream ss;
-        ss << "Not implemented type kind: " << kind;
-        return Status::NotImplemented(ss.str());
+        return Status::NotImplemented("Not implemented type kind: ", kind);
    }
  }
diff --git a/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt b/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt
index db4264b59ab63..5bb5b725910e3 100644
--- a/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt
+++ b/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt
@@ -15,7 +15,4 @@
# specific language governing permissions and limitations
# under the License.
-# Headers: top level
-install(FILES
-  convert.h
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/tensorflow")
+ARROW_INSTALL_ALL_HEADERS("arrow/adapters/tensorflow")
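Like the top-level ARROW_INSTALL_ALL_HEADERS("arrow") call above, this relies on the glob-and-filter helper introduced earlier in this diff: it globs *.h and *.hpp in the current source directory and installs everything whose path does not match "internal". A sketch of the intended effect, with hypothetical file names:

    # Given convert.h and foo-internal.h in this directory:
    #   convert.h      -> installed to ${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/tensorflow
    #   foo-internal.h -> skipped (name matches "internal")
    #
    # The default pattern can also be overridden, e.g. to install only .h files:
    ARROW_INSTALL_ALL_HEADERS("arrow/adapters/tensorflow" PATTERN "*.h")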
-# Headers: top level -install(FILES - convert.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/tensorflow") +ARROW_INSTALL_ALL_HEADERS("arrow/adapters/tensorflow") diff --git a/cpp/src/arrow/allocator-test.cc b/cpp/src/arrow/allocator-test.cc index cdffbd7e8494f..1a94467281dbc 100644 --- a/cpp/src/arrow/allocator-test.cc +++ b/cpp/src/arrow/allocator-test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/src/arrow/allocator.h b/cpp/src/arrow/allocator.h index 144ba575063a3..a02b8e64bb05a 100644 --- a/cpp/src/arrow/allocator.h +++ b/cpp/src/arrow/allocator.h @@ -29,6 +29,7 @@ namespace arrow { +/// \brief An STL allocator delegating allocations to an Arrow MemoryPool template class stl_allocator { public: @@ -45,7 +46,9 @@ class stl_allocator { using other = stl_allocator; }; + /// \brief Construct an allocator from the default MemoryPool stl_allocator() noexcept : pool_(default_memory_pool()) {} + /// \brief Construct an allocator from the given MemoryPool explicit stl_allocator(MemoryPool* pool) noexcept : pool_(pool) {} template @@ -86,9 +89,14 @@ class stl_allocator { MemoryPool* pool_; }; +/// \brief A MemoryPool implementation delegating allocations to an STL allocator +/// +/// Note that STL allocators don't provide a resizing operation, and therefore +/// any buffer resizes will do a full reallocation and copy. template > class STLMemoryPool : public MemoryPool { public: + /// \brief Construct a memory pool from the given allocator explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} Status Allocate(int64_t size, uint8_t** out) override { diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc index 4376695c68cba..6f938c82bfd0a 100644 --- a/cpp/src/arrow/array-binary-test.cc +++ b/cpp/src/arrow/array-binary-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License.
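The doc comments added to allocator.h above describe a pair of mirror adapters: stl_allocator routes STL container allocations through an Arrow MemoryPool, while STLMemoryPool exposes an STL allocator as a MemoryPool. A minimal usage sketch, assuming only the APIs visible in that header (the function name is illustrative):

```cpp
#include <memory>
#include <vector>

#include "arrow/allocator.h"
#include "arrow/memory_pool.h"

void AllocatorExample() {
  // An STL container whose memory is accounted for by the default Arrow pool.
  arrow::stl_allocator<int64_t> alloc(arrow::default_memory_pool());
  std::vector<int64_t, arrow::stl_allocator<int64_t>> values(alloc);
  values.resize(1024);

  // The reverse direction: a MemoryPool backed by std::allocator. As the
  // new comment notes, resizes degenerate to allocate-copy-free.
  arrow::STLMemoryPool<std::allocator<uint8_t>> pool{std::allocator<uint8_t>()};
  uint8_t* data = nullptr;
  if (pool.Allocate(64, &data).ok()) {
    pool.Free(data, 64);
  }
}
```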
-#include #include #include -#include #include #include #include @@ -28,10 +26,14 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { @@ -676,4 +678,112 @@ TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality(); } TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } +// ---------------------------------------------------------------------- +// ChunkedBinaryBuilder tests + +class TestChunkedBinaryBuilder : public ::testing::Test { + public: + void SetUp() {} + + void Init(int32_t chunksize) { + builder_.reset(new internal::ChunkedBinaryBuilder(chunksize)); + } + + protected: + std::unique_ptr builder_; +}; + +TEST_F(TestChunkedBinaryBuilder, BasicOperation) { + const int32_t chunksize = 1000; + Init(chunksize); + + const int elem_size = 10; + uint8_t buf[elem_size]; + + BinaryBuilder unchunked_builder; + + const int iterations = 1000; + for (int i = 0; i < iterations; ++i) { + random_bytes(elem_size, i, buf); + + ASSERT_OK(unchunked_builder.Append(buf, elem_size)); + ASSERT_OK(builder_->Append(buf, elem_size)); + } + + std::shared_ptr unchunked; + ASSERT_OK(unchunked_builder.Finish(&unchunked)); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + // This assumes that everything is evenly divisible + ArrayVector expected_chunks; + const int elems_per_chunk = chunksize / elem_size; + for (int i = 0; i < iterations / elems_per_chunk; ++i) { + expected_chunks.emplace_back(unchunked->Slice(i * elems_per_chunk, elems_per_chunk)); + } + + ASSERT_EQ(expected_chunks.size(), chunks.size()); + for (size_t i = 0; i < chunks.size(); ++i) { + AssertArraysEqual(*expected_chunks[i], *chunks[i]); + } +} + +TEST_F(TestChunkedBinaryBuilder, NoData) { + Init(1000); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + ASSERT_EQ(1, chunks.size()); + ASSERT_EQ(0, chunks[0]->length()); +} + +TEST_F(TestChunkedBinaryBuilder, LargeElements) { + Init(100); + + const int bufsize = 101; + uint8_t buf[bufsize]; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + random_bytes(bufsize, i, buf); + ASSERT_OK(builder_->Append(buf, bufsize)); + } + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + ASSERT_EQ(iterations, static_cast(chunks.size())); + + int64_t total_data_size = 0; + for (auto chunk : chunks) { + ASSERT_EQ(1, chunk->length()); + total_data_size += + static_cast(static_cast(*chunk).GetView(0).size()); + } + ASSERT_EQ(iterations * bufsize, total_data_size); +} + +TEST(TestChunkedStringBuilder, BasicOperation) { + const int chunksize = 100; + internal::ChunkedStringBuilder builder(chunksize); + + std::string value = "0123456789"; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + ASSERT_OK(builder.Append(value)); + } + + ArrayVector chunks; + ASSERT_OK(builder.Finish(&chunks)); + + ASSERT_EQ(10, chunks.size()); + + // Type is correct + for (auto chunk : chunks) { + ASSERT_TRUE(chunk->type()->Equals(*::arrow::utf8())); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 4c8dcc067b8c5..5134d1fe927a8 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -15,29 +15,32 @@ // specific language governing 
permissions and limitations // under the License. -#include +#include #include -#include -#include #include +#include #include #include #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" namespace arrow { using std::string; using std::vector; +using internal::checked_cast; + // ---------------------------------------------------------------------- // Dictionary tests @@ -55,59 +58,40 @@ TYPED_TEST(TestDictionaryBuilder, Basic) { ASSERT_OK(builder.Append(static_cast(1))); ASSERT_OK(builder.Append(static_cast(2))); ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.AppendNull()); + + ASSERT_EQ(builder.length(), 4); + ASSERT_EQ(builder.null_count(), 1); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder; - ASSERT_OK(dict_builder.Append(static_cast(1))); - ASSERT_OK(dict_builder.Append(static_cast(2))); - std::shared_ptr dict_array; - ASSERT_OK(dict_builder.Finish(&dict_array)); - auto dtype = std::make_shared(int8(), dict_array); + auto dict_array = ArrayFromJSON(std::make_shared(), "[1, 2]"); + auto dict_type = std::make_shared(int8(), dict_array); - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); + DictionaryArray expected(dict_type, int_array); - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { - NumericBuilder builder; - // DictionaryBuilder builder; - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); + auto type = std::make_shared(); - std::shared_ptr intermediate_result; - ASSERT_OK(builder.Finish(&intermediate_result)); + auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]"); DictionaryBuilder dictionary_builder(default_memory_pool()); ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result)); std::shared_ptr result; ASSERT_OK(dictionary_builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder; - ASSERT_OK(dict_builder.Append(static_cast(1))); - ASSERT_OK(dict_builder.Append(static_cast(2))); - std::shared_ptr dict_array; - ASSERT_OK(dict_builder.Finish(&dict_array)); - auto dtype = std::make_shared(int8(), dict_array); + auto dict_array = ArrayFromJSON(type, "[1, 2]"); + auto dict_type = std::make_shared(int8(), dict_array); - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dict_type, int_array); - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } @@ -150,120 +134,74 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { } TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) { + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + DictionaryBuilder builder(default_memory_pool()); - ASSERT_OK(builder.Append(static_cast(1))); - 
ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); std::shared_ptr result; FinishAndCheckPadding(&builder, &result); // Build expected data for the initial dictionary - NumericBuilder dict_builder1; - ASSERT_OK(dict_builder1.Append(static_cast(1))); - ASSERT_OK(dict_builder1.Append(static_cast(2))); - std::shared_ptr dict_array1; - ASSERT_OK(dict_builder1.Finish(&dict_array1)); - auto dtype1 = std::make_shared(int8(), dict_array1); + auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]")); + DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]")); - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); - - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // extend the dictionary builder with new data - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(3))); std::shared_ptr result_delta; ASSERT_OK(builder.Finish(&result_delta)); // Build expected data for the delta dictionary - NumericBuilder dict_builder2; - ASSERT_OK(dict_builder2.Append(static_cast(3))); - std::shared_ptr dict_array2; - ASSERT_OK(dict_builder2.Finish(&dict_array2)); - auto dtype2 = std::make_shared(int8(), dict_array2); + auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]")); + DictionaryArray expected_delta(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]")); - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(0)); - ASSERT_OK(int_builder2.Append(2)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); - - DictionaryArray expected_delta(dtype2, int_array2); ASSERT_TRUE(expected_delta.Equals(result_delta)); } TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + DictionaryBuilder builder(default_memory_pool()); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); std::shared_ptr result; FinishAndCheckPadding(&builder, &result); // Build expected data for the initial dictionary - NumericBuilder dict_builder1; - ASSERT_OK(dict_builder1.Append(static_cast(1))); - ASSERT_OK(dict_builder1.Append(static_cast(2))); - std::shared_ptr dict_array1; - ASSERT_OK(dict_builder1.Finish(&dict_array1)); - auto dtype1 = std::make_shared(int8(), dict_array1); - - Int8Builder int_builder1; - 
ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]")); + DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]")); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // extend the dictionary builder with new data - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(3))); std::shared_ptr result_delta1; ASSERT_OK(builder.Finish(&result_delta1)); // Build expected data for the delta dictionary - NumericBuilder dict_builder2; - ASSERT_OK(dict_builder2.Append(static_cast(3))); - std::shared_ptr dict_array2; - ASSERT_OK(dict_builder2.Finish(&dict_array2)); - auto dtype2 = std::make_shared(int8(), dict_array2); + auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]")); + DictionaryArray expected_delta1(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]")); - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(0)); - ASSERT_OK(int_builder2.Append(2)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); - - DictionaryArray expected_delta1(dtype2, int_array2); ASSERT_TRUE(expected_delta1.Equals(result_delta1)); // extend the dictionary builder with new data again @@ -277,23 +215,9 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { ASSERT_OK(builder.Finish(&result_delta2)); // Build expected data for the delta dictionary again - NumericBuilder dict_builder3; - ASSERT_OK(dict_builder3.Append(static_cast(4))); - ASSERT_OK(dict_builder3.Append(static_cast(5))); - std::shared_ptr dict_array3; - ASSERT_OK(dict_builder3.Finish(&dict_array3)); - auto dtype3 = std::make_shared(int8(), dict_array3); - - Int8Builder int_builder3; - ASSERT_OK(int_builder3.Append(0)); - ASSERT_OK(int_builder3.Append(1)); - ASSERT_OK(int_builder3.Append(2)); - ASSERT_OK(int_builder3.Append(3)); - ASSERT_OK(int_builder3.Append(4)); - std::shared_ptr int_array3; - ASSERT_OK(int_builder3.Finish(&int_array3)); + auto dict_type3 = dictionary(int8(), ArrayFromJSON(type, "[4, 5]")); + DictionaryArray expected_delta2(dict_type3, ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]")); - DictionaryArray expected_delta2(dtype3, int_array3); ASSERT_TRUE(expected_delta2.Equals(result_delta2)); } @@ -308,21 +232,27 @@ TEST(TestStringDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder; - ASSERT_OK(str_builder.Append("test")); - ASSERT_OK(str_builder.Append("test2")); - std::shared_ptr str_array; - ASSERT_OK(str_builder.Finish(&str_array)); - auto dtype = std::make_shared(int8(), str_array); + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]")); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dtype, int_array); - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - 
ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); + ASSERT_TRUE(expected.Equals(result)); +} + +// ARROW-4367 +TEST(TestStringDictionaryBuilder, OnlyNull) { + // Build the dictionary Array + StringDictionaryBuilder builder(default_memory_pool()); + ASSERT_OK(builder.AppendNull()); + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + + // Build expected data + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[]")); + auto int_array = ArrayFromJSON(int8(), "[null]"); DictionaryArray expected(dtype, int_array); + ASSERT_TRUE(expected.Equals(result)); } @@ -373,21 +303,10 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder1; - ASSERT_OK(str_builder1.Append("test")); - ASSERT_OK(str_builder1.Append("test2")); - std::shared_ptr str_array1; - ASSERT_OK(str_builder1.Finish(&str_array1)); - auto dtype1 = std::make_shared(int8(), str_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]")); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dtype, int_array); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // build a delta dictionary @@ -399,20 +318,10 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { FinishAndCheckPadding(&builder, &result_delta); // Build expected data - StringBuilder str_builder2; - ASSERT_OK(str_builder2.Append("test3")); - std::shared_ptr str_array2; - ASSERT_OK(str_builder2.Finish(&str_array2)); - auto dtype2 = std::make_shared(int8(), str_array2); - - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(1)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); - + auto dtype2 = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test3\"]")); + auto int_array2 = ArrayFromJSON(int8(), "[1, 2, 1]"); DictionaryArray expected_delta(dtype2, int_array2); + ASSERT_TRUE(expected_delta.Equals(result_delta)); } @@ -647,7 +556,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { TEST(TestDecimalDictionaryBuilder, Basic) { // Build the dictionary Array - const auto& decimal_type = arrow::decimal(2, 0); + auto decimal_type = arrow::decimal(2, 0); DictionaryBuilder builder(decimal_type, default_memory_pool()); // Test data @@ -660,20 +569,9 @@ TEST(TestDecimalDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - FixedSizeBinaryBuilder decimal_builder(decimal_type); - ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes())); - ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes())); - - std::shared_ptr decimal_array; - ASSERT_OK(decimal_builder.Finish(&decimal_array)); - auto dtype = arrow::dictionary(int8(), decimal_array); + auto dtype = dictionary(int8(), ArrayFromJSON(decimal_type, "[\"12\", \"11\"]")); + DictionaryArray expected(dtype, ArrayFromJSON(int8(), "[0, 0, 1, 0]")); - Int8Builder int_builder; - ASSERT_OK(int_builder.AppendValues({0, 0, 1, 0})); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); - - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } @@ 
-758,26 +656,20 @@ TEST(TestDictionary, Basics) { TEST(TestDictionary, Equals) { vector is_valid = {true, true, false, true, true, true}; + std::shared_ptr dict, dict2, indices, indices2, indices3; - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr dict2; - vector dict2_values = {"foo", "bar", "baz", "qux"}; - ArrayFromVector(dict2_values, &dict2); + dict2 = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]"); std::shared_ptr dict2_type = dictionary(int16(), dict2); - std::shared_ptr indices; vector indices_values = {1, 2, -1, 0, 2, 0}; ArrayFromVector(is_valid, indices_values, &indices); - std::shared_ptr indices2; vector indices2_values = {1, 2, 0, 0, 2, 0}; ArrayFromVector(is_valid, indices2_values, &indices2); - std::shared_ptr indices3; vector indices3_values = {1, 1, 0, 0, 2, 0}; ArrayFromVector(is_valid, indices3_values, &indices3); @@ -825,17 +717,10 @@ TEST(TestDictionary, Equals) { } TEST(TestDictionary, Validate) { - vector is_valid = {true, true, false, true, true, true}; - - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr indices; - vector indices_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(is_valid, indices_values, &indices); - + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]"); std::shared_ptr arr = std::make_shared(dict_type, indices); // Only checking index type for now @@ -857,28 +742,20 @@ TEST(TestDictionary, Validate) { } TEST(TestDictionary, FromArray) { - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr indices1; - vector indices_values1 = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(indices_values1, &indices1); - - std::shared_ptr indices2; - vector indices_values2 = {1, 2, 0, 3, 2, 0}; - ArrayFromVector(indices_values2, &indices2); + auto indices1 = ArrayFromJSON(int16(), "[1, 2, 0, 0, 2, 0]"); + auto indices2 = ArrayFromJSON(int16(), "[1, 2, 0, 3, 2, 0]"); + // Invalid index is masked by null std::shared_ptr indices3; vector is_valid3 = {true, true, false, true, true, true}; vector indices_values3 = {1, 2, -1, 0, 2, 0}; ArrayFromVector(is_valid3, indices_values3, &indices3); - std::shared_ptr indices4; - vector is_valid4 = {true, true, false, true, true, true}; - vector indices_values4 = {1, 2, 1, 3, 2, 0}; - ArrayFromVector(is_valid4, indices_values4, &indices4); + // Index out of bounds + auto indices4 = ArrayFromJSON(int16(), "[1, 2, null, 3, 2, 0]"); std::shared_ptr arr1, arr2, arr3, arr4; ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices1, &arr1)); @@ -887,4 +764,63 @@ TEST(TestDictionary, FromArray) { ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4)); } +TEST(TestDictionary, TransposeBasic) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); + // ["B", "C", "A", "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + // Transpose to same index type 
+ { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*out, *expected); + } + + // Transpose to other type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int8(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); + } +} + +TEST(TestDictionary, TransposeNulls) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]"); + // ["B", "C", null, "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); +} + } // namespace arrow diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 207acd4cf65d7..c49c5e3097058 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -32,6 +30,8 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index dc8bafd4c0071..68c35f57116a8 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. 
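The TransposeBasic and TransposeNulls tests above pin down the semantics of the new DictionaryArray::Transpose API (added to array.h and array.cc below): transpose_map[i] gives the position in the target dictionary of entry i of the source dictionary, and null indices pass through unchanged. A condensed sketch of a call, assuming arr and out_dict_type are constructed as in TransposeBasic:

```cpp
#include <memory>
#include <vector>

#include "arrow/array.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"

// Source dictionary ["A", "B", "C"], target ["Z", "A", "C", "B"]:
// "A" maps to slot 1, "B" to slot 3, "C" to slot 2.
arrow::Status RemapDictionary(const std::shared_ptr<arrow::Array>& arr,
                              const std::shared_ptr<arrow::DataType>& out_dict_type,
                              std::shared_ptr<arrow::Array>* out) {
  const std::vector<int32_t> transpose_map{1, 3, 2};
  const auto& dict_array =
      arrow::internal::checked_cast<const arrow::DictionaryArray&>(*arr);
  return dict_array.Transpose(arrow::default_memory_pool(), out_dict_type,
                              transpose_map, out);
}
```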
-#include #include #include -#include #include #include #include @@ -26,12 +24,12 @@ #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 1a88740a4ac08..ccdaad58c681a 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include "arrow/array.h" +#include "arrow/buffer-builder.h" #include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/ipc/test-common.h" @@ -40,7 +40,6 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" @@ -246,6 +245,23 @@ TEST_F(TestArray, BuildLargeInMemoryArray) { TEST_F(TestArray, TestCopy) {} +// ---------------------------------------------------------------------- +// Null type tests + +TEST(TestNullBuilder, Basics) { + NullBuilder builder; + std::shared_ptr array; + + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Append(nullptr)); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Finish(&array)); + + const auto& null_array = checked_cast(*array); + ASSERT_EQ(null_array.length(), 3); + ASSERT_EQ(null_array.null_count(), 3); +} + // ---------------------------------------------------------------------- // Primitive type tests @@ -746,22 +762,22 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesLazyIter) { auto& draws = this->draws_; auto& valid_bytes = this->valid_bytes_; - auto doubler = [&draws](int64_t index) { return draws[index] * 2; }; - auto lazy_iter = internal::MakeLazyRange(doubler, size); + auto halve = [&draws](int64_t index) { return draws[index] / 2; }; + auto lazy_iter = internal::MakeLazyRange(halve, size); ASSERT_OK(this->builder_->AppendValues(lazy_iter.begin(), lazy_iter.end(), valid_bytes.begin())); - std::vector doubled; - transform(draws.begin(), draws.end(), back_inserter(doubled), - [](T in) { return in * 2; }); + std::vector halved; + transform(draws.begin(), draws.end(), back_inserter(halved), + [](T in) { return in / 2; }); std::shared_ptr result; FinishAndCheckPadding(this->builder_.get(), &result); std::shared_ptr expected; ASSERT_OK( - this->builder_->AppendValues(doubled.data(), doubled.size(), valid_bytes.data())); + this->builder_->AppendValues(halved.data(), halved.size(), valid_bytes.data())); FinishAndCheckPadding(this->builder_.get(), &expected); ASSERT_TRUE(expected->Equals(result)); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 05d66d5cffdb2..1569889c0a6d0 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -18,6 +18,7 @@ #include "arrow/array.h" #include +#include #include #include #include @@ -32,6 +33,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/visitor.h" @@ -163,12 +165,6 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int64_t le SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset)); } -template -NumericArray::NumericArray(const std::shared_ptr& data) - : PrimitiveArray(data) { - 
DCHECK_EQ(data->type->id(), T::type_id); -} - // ---------------------------------------------------------------------- // BooleanArray @@ -393,7 +389,7 @@ std::shared_ptr StructArray::field(int i) const { } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { - int i = struct_type()->GetChildIndex(name); + int i = struct_type()->GetFieldIndex(name); return i == -1 ? nullptr : field(i); } @@ -636,9 +632,8 @@ Status DictionaryArray::FromArrays(const std::shared_ptr& type, is_valid = ValidateDictionaryIndices(indices, upper_bound); break; default: - std::stringstream ss; - ss << "Categorical index type not supported: " << indices->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Categorical index type not supported: ", + indices->type()->ToString()); } if (!is_valid.ok()) { @@ -662,6 +657,66 @@ std::shared_ptr DictionaryArray::dictionary() const { return dict_type_->dictionary(); } +template +static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, + const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) { + using in_c_type = typename InType::c_type; + using out_c_type = typename OutType::c_type; + + std::shared_ptr out_buffer; + RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer)); + // Null bitmap is unchanged + auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer}, + in_data.null_count); + internal::TransposeInts(in_data.GetValues(1), + out_data->GetMutableValues(1), in_data.length, + transpose_map.data()); + *out = MakeArray(out_data); + return Status::OK(); +} + +Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const { + DCHECK_EQ(type->id(), Type::DICTIONARY); + const auto& out_dict_type = checked_cast(*type); + + // XXX We'll probably want to make this operation a kernel when we + // implement dictionary-to-dictionary casting. 
+ auto in_type_id = dict_type_->index_type()->id(); + auto out_type_id = out_dict_type.index_type()->id(); + +#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ + case OUT_INDEX_TYPE::type_id: \ + return TransposeDictIndices(pool, *data(), type, \ + transpose_map, out); + +#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \ + case IN_INDEX_TYPE::type_id: \ + switch (out_type_id) { \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \ + default: \ + return Status::NotImplemented("unexpected index type"); \ + } + + switch (in_type_id) { + TRANSPOSE_IN_CASE(Int8Type) + TRANSPOSE_IN_CASE(Int16Type) + TRANSPOSE_IN_CASE(Int32Type) + TRANSPOSE_IN_CASE(Int64Type) + default: + return Status::NotImplemented("unexpected index type"); + } + +#undef TRANSPOSE_IN_OUT_CASE +#undef TRANSPOSE_IN_CASE +} + // ---------------------------------------------------------------------- // Implement Array::Accept as inline visitor @@ -678,12 +733,11 @@ struct ValidateVisitor { Status Visit(const NullArray&) { return Status::OK(); } Status Visit(const PrimitiveArray& array) { - if (array.data()->buffers.size() != 2) { - return Status::Invalid("number of buffers was != 2"); - } - if (array.values() == nullptr) { - return Status::Invalid("values was null"); - } + ARROW_RETURN_IF(array.data()->buffers.size() != 2, + Status::Invalid("number of buffers was != 2")); + + ARROW_RETURN_IF(array.values() == nullptr, Status::Invalid("values was null")); + return Status::OK(); } @@ -714,10 +768,8 @@ struct ValidateVisitor { return Status::Invalid("value_offsets_ was null"); } if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { - std::stringstream ss; - ss << "offset buffer size (bytes): " << value_offsets->size() - << " isn't large enough for length: " << array.length(); - return Status::Invalid(ss.str()); + return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), + " isn't large enough for length: ", array.length()); } if (!array.values()) { @@ -726,17 +778,13 @@ struct ValidateVisitor { const int32_t last_offset = array.value_offset(array.length()); if (array.values()->length() != last_offset) { - std::stringstream ss; - ss << "Final offset invariant not equal to values length: " << last_offset - << "!=" << array.values()->length(); - return Status::Invalid(ss.str()); + return Status::Invalid("Final offset invariant not equal to values length: ", + last_offset, "!=", array.values()->length()); } const Status child_valid = ValidateArray(*array.values()); if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Child array invalid: ", child_valid.ToString()); } int32_t prev_offset = array.value_offset(0); @@ -746,18 +794,14 @@ struct ValidateVisitor { for (int64_t i = 1; i <= array.length(); ++i) { int32_t current_offset = array.value_offset(i); if (array.IsNull(i - 1) && current_offset != prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure at: " << i - << " inconsistent value_offsets for null slot" << current_offset - << "!=" << prev_offset; - return Status::Invalid(ss.str()); + return Status::Invalid("Offset invariant failure at: ", i, + " inconsistent value_offsets for null slot", + current_offset, "!=", prev_offset); } if (current_offset < prev_offset) { - std::stringstream ss; - ss << 
"Offset invariant failure: " << i - << " inconsistent offset for non-null slot: " << current_offset << "<" - << prev_offset; - return Status::Invalid(ss.str()); + return Status::Invalid("Offset invariant failure: ", i, + " inconsistent offset for non-null slot: ", current_offset, + "<", prev_offset); } prev_offset = current_offset; } @@ -780,18 +824,14 @@ struct ValidateVisitor { for (int i = 0; i < array.num_fields(); ++i) { auto it = array.field(i); if (it->length() != array_length) { - std::stringstream ss; - ss << "Length is not equal from field " << it->type()->ToString() - << " at position {" << idx << "}"; - return Status::Invalid(ss.str()); + return Status::Invalid("Length is not equal from field ", + it->type()->ToString(), " at position [", idx, "]"); } const Status child_valid = ValidateArray(*it); if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString() << " at position {" - << idx << "}"; - return Status::Invalid(ss.str()); + return Status::Invalid("Child array invalid: ", child_valid.ToString(), + " at position [", idx, "}"); } ++idx; } @@ -941,24 +981,4 @@ std::vector RechunkArraysConsistently( } // namespace internal -// ---------------------------------------------------------------------- -// Instantiate templates - -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; -template class ARROW_TEMPLATE_EXPORT NumericArray; - } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index b34b53933314f..5b4daa808c7e4 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -18,7 +18,6 @@ #ifndef ARROW_ARRAY_H #define ARROW_ARRAY_H -#include #include #include #include @@ -87,7 +86,7 @@ class Status; /// input array and replace them with newly-allocated data, changing the output /// data type as well. struct ARROW_EXPORT ArrayData { - ArrayData() : length(0) {} + ArrayData() : length(0), null_count(0), offset(0) {} ArrayData(const std::shared_ptr& type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0) @@ -311,7 +310,7 @@ class ARROW_EXPORT Array { std::string ToString() const; protected: - Array() {} + Array() : null_bitmap_data_(NULLPTR) {} std::shared_ptr data_; const uint8_t* null_bitmap_data_; @@ -382,7 +381,7 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { std::shared_ptr values() const { return data_->buffers[1]; } protected: - PrimitiveArray() {} + PrimitiveArray() : raw_values_(NULLPTR) {} inline void SetData(const std::shared_ptr& data) { auto values = data->buffers[1]; @@ -399,12 +398,12 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { /// Concrete Array class for numeric data. 
template -class ARROW_EXPORT NumericArray : public PrimitiveArray { +class NumericArray : public PrimitiveArray { public: using TypeClass = TYPE; using value_type = typename TypeClass::c_type; - explicit NumericArray(const std::shared_ptr& data); + explicit NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) {} // Only enable this constructor without a type argument for types without additional // metadata @@ -423,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int64_t i) const { return raw_values()[i]; } + // For API compatibility with BinaryArray etc. + value_type GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -443,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { i + data_->offset); } + bool GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -565,7 +569,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray { protected: // For subclasses - BinaryArray() {} + BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} /// Protected method for constructors void SetData(const std::shared_ptr& data); @@ -803,7 +807,7 @@ class ARROW_EXPORT DictionaryArray : public Array { /// This function does the validation of the indices and input type. It checks if /// all indices are non-negative and smaller than the size of the dictionary /// - /// \param[in] type a data type containing a dictionary + /// \param[in] type a dictionary type /// \param[in] indices an array of non-negative signed /// integers smaller than the size of the dictionary /// \param[out] out the resulting DictionaryArray instance @@ -811,6 +815,23 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& indices, std::shared_ptr* out); + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary type, + /// transposing indices using the transpose map. + /// The type and the transpose map are typically computed using + /// DictionaryType::Unify. + /// + /// \param[in] pool a pool to allocate the array data from + /// \param[in] type a dictionary type + /// \param[in] transpose_map a vector transposing this array's indices + /// into the target array's indices + /// \param[out] out the resulting DictionaryArray instance + Status Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const; + // XXX Do we also want an unsafe in-place Transpose? 
+ std::shared_ptr indices() const; std::shared_ptr dictionary() const; @@ -823,27 +844,6 @@ class ARROW_EXPORT DictionaryArray : public Array { std::shared_ptr indices_; }; -// ---------------------------------------------------------------------- -// extern templates and other details - -// Only instantiate these templates once -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; -ARROW_EXTERN_TEMPLATE NumericArray; - /// \brief Perform any validation checks to determine obvious inconsistencies /// with the array's internal data /// diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt new file mode 100644 index 0000000000000..4a8ce3490abd1 --- /dev/null +++ b/cpp/src/arrow/array/CMakeLists.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Headers: top level +ARROW_INSTALL_ALL_HEADERS("arrow/array") diff --git a/cpp/doc/Parquet.md b/cpp/src/arrow/array/README.md similarity index 62% rename from cpp/doc/Parquet.md rename to cpp/src/arrow/array/README.md index 0ed100731ca1a..09580193aad28 100644 --- a/cpp/doc/Parquet.md +++ b/cpp/src/arrow/array/README.md @@ -17,21 +17,4 @@ under the License. --> -## Building Arrow-Parquet integration - -To use Arrow C++ with Parquet, you must first build the Arrow C++ libraries and -install them someplace. Then, you can build [parquet-cpp][1] with the Arrow -adapter library: - -```bash -# Set this to your preferred install location -export ARROW_HOME=$HOME/local - -git clone https://github.com/apache/parquet-cpp.git -cd parquet-cpp -cmake -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME -make -j4 -make install -``` - -[1]: https://github.com/apache/parquet-cpp +## Implementation details related to columnar (array) data structures diff --git a/cpp/src/arrow/builder-adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc similarity index 96% rename from cpp/src/arrow/builder-adaptive.cc rename to cpp/src/arrow/array/builder_adaptive.cc index a715f469c7aa1..e96c9a2400833 100644 --- a/cpp/src/arrow/builder-adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License.
+#include "arrow/array/builder_adaptive.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -88,12 +90,13 @@ Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); } - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + std::shared_ptr null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get())); - *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_); + *out = ArrayData::Make(output_type, length_, {null_bitmap, data_}, null_count_); - data_ = null_bitmap_ = nullptr; + data_ = nullptr; capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -273,12 +276,13 @@ Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr* out) { return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); } - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + std::shared_ptr null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get())); - *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_); + *out = ArrayData::Make(output_type, length_, {null_bitmap, data_}, null_count_); - data_ = null_bitmap_ = nullptr; + data_ = nullptr; capacity_ = length_ = null_count_ = 0; return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h new file mode 100644 index 0000000000000..6523de41622e4 --- /dev/null +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/array/builder_base.h" + +namespace arrow { + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + explicit AdaptiveIntBuilderBase(MemoryPool* pool); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + Status AppendNull() { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + virtual Status CommitPendingData() = 0; + + std::shared_ptr data_; + uint8_t* raw_data_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_; + bool pending_has_nulls_; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { + auto v = static_cast(val); + + pending_data_[pending_pos_] = v; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] 
valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc new file mode 100644 index 0000000000000..e8059007c34ce --- /dev/null +++ b/cpp/src/arrow/array/builder_base.cc @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/array/builder_base.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { + if (buffer) { + if (bytes_filled < buffer->size()) { + // Trim buffer + RETURN_NOT_OK(buffer->Resize(bytes_filled)); + } + // zero the padding + buffer->ZeroPadding(); + } else { + // Null buffers are allowed in place of 0-byte buffers + DCHECK_EQ(bytes_filled, 0); + } + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(bool is_valid) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status ArrayBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity_ = capacity; + return null_bitmap_builder_.Resize(capacity); +} + +Status ArrayBuilder::Advance(int64_t elements) { + if (length_ + elements > capacity_) { + return Status::Invalid("Builder must be expanded"); + } + length_ += elements; + return null_bitmap_builder_.Advance(elements); +} + +Status ArrayBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr internal_data; + RETURN_NOT_OK(FinishInternal(&internal_data)); + *out = MakeArray(internal_data); + return Status::OK(); +} + +void ArrayBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_builder_.Reset(); +} + +Status ArrayBuilder::SetNotNull(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeSetNotNull(length); + return Status::OK(); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const std::vector& is_valid) { + for (bool element_valid : is_valid) { + UnsafeAppendToBitmap(element_valid); + } +} + +void ArrayBuilder::UnsafeSetNotNull(int64_t length) { + length_ += length; + null_bitmap_builder_.UnsafeAppend(length, true); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h new file mode 100644 index 0000000000000..f4655fab0dea5 --- /dev/null +++ b/cpp/src/arrow/array/builder_base.h @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer-builder.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +struct ArrayData; + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + +/// Base class for all data array builders. +/// +/// This class provides facilities for incrementally building the null bitmap +/// (see the Append methods) and, as a side effect, tracking the current number +/// of slots and the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), pool_(pool), null_bitmap_builder_(pool) {} + + virtual ~ArrayBuilder() = default; + + /// For nested types. Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + int num_children() const { return static_cast(children_.size()); } + + int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. + /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to add the indicated + /// number of elements without any further calls to Resize. The memory + /// allocated is rounded up to the next highest power of 2 similar to memory + /// allocations in STL containers like std::vector + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity) { + auto min_capacity = length() + additional_capacity; + if (min_capacity <= capacity()) return Status::OK(); + + // leave growth factor up to BufferBuilder + auto new_capacity = BufferBuilder::GrowByFactor(min_capacity); + return Resize(new_capacity); + } + + /// Reset the builder. + virtual void Reset(); + + /// For cases where raw data was memcpy'd into the internal buffers, allows us + /// to advance the length of the builder. It is your responsibility to use + /// it correctly. + Status Advance(int64_t elements); + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr* out) = 0; + + /// \brief Return result of builder as an Array object.
+ /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr* out); + + std::shared_ptr type() const { return type_; } + + protected: + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Set the next length bits to not null (i.e. valid). + Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + null_bitmap_builder_.UnsafeAppend(is_valid); + ++length_; + if (!is_valid) ++null_count_; + } + + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + if (valid_bytes == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(valid_bytes, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + + void UnsafeAppendToBitmap(const std::vector& is_valid); + + // Set the next length bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { + if (new_capacity < 0) { + return Status::Invalid("Resize capacity must be positive"); + } + if (new_capacity < old_capacity) { + return Status::Invalid("Resize cannot downsize"); + } + return Status::OK(); + } + + std::shared_ptr type_; + MemoryPool* pool_; + + TypedBufferBuilder null_bitmap_builder_; + int64_t null_count_ = 0; + + // Array length, so far. Also, the index of the next element to be added + int64_t length_ = 0; + int64_t capacity_ = 0; + + // Child value array builders. These are owned by this class + std::vector> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-binary.cc b/cpp/src/arrow/array/builder_binary.cc similarity index 72% rename from cpp/src/arrow/builder-binary.cc rename to cpp/src/arrow/array/builder_binary.cc index c250837b4a3fa..4fef135b20348 100644 --- a/cpp/src/arrow/builder-binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
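(To ground the base-class contract above, a minimal usage sketch — not part of the patch itself; Int64Builder is one of the concrete builders declared later in this change, and the values are arbitrary:)

    #include "arrow/builder.h"

    arrow::Status BuildExample(std::shared_ptr<arrow::Array>* out) {
      arrow::Int64Builder builder;
      ARROW_RETURN_NOT_OK(builder.Reserve(3));    // one up-front allocation
      ARROW_RETURN_NOT_OK(builder.Append(1));
      ARROW_RETURN_NOT_OK(builder.AppendNull());  // recorded in the null bitmap
      ARROW_RETURN_NOT_OK(builder.Append(3));
      return builder.Finish(out);                 // Finish resets the builder
    }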
+#include "arrow/array/builder_binary.h" + #include #include #include @@ -27,7 +29,6 @@ #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -53,47 +54,23 @@ Status BinaryBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); + RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); return ArrayBuilder::Resize(capacity); } Status BinaryBuilder::ReserveData(int64_t elements) { - if (value_data_length() + elements > value_data_capacity()) { - if (value_data_length() + elements > kBinaryMemoryLimit) { - return Status::CapacityError( - "Cannot reserve capacity larger than 2^31 - 1 for binary"); - } - RETURN_NOT_OK(value_data_builder_.Reserve(elements)); - } - return Status::OK(); -} + const int64_t size = value_data_length() + elements; + ARROW_RETURN_IF( + size > kBinaryMemoryLimit, + Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary")); -Status BinaryBuilder::AppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { - std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " - << num_bytes; - return Status::CapacityError(ss.str()); - } - return offsets_builder_.Append(static_cast(num_bytes)); + return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements) + : Status::OK(); } -Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { - RETURN_NOT_OK(Reserve(1)); - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(value_data_builder_.Append(value, length)); - - UnsafeAppendToBitmap(true); - return Status::OK(); -} - -Status BinaryBuilder::AppendNull() { - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(Reserve(1)); - - UnsafeAppendToBitmap(false); - return Status::OK(); +Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { + return Status::CapacityError("BinaryArray cannot contain more than ", + kBinaryMemoryLimit, " bytes, have ", num_bytes); } Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { @@ -101,12 +78,13 @@ Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(AppendNextOffset()); // These buffers' padding zeroed by BufferBuilder - std::shared_ptr offsets, value_data; + std::shared_ptr offsets, value_data, null_bitmap; RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets, value_data}, null_count_, - 0); + *out = + ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, null_count_, 0); Reset(); return Status::OK(); } @@ -154,17 +132,17 @@ Status StringBuilder::AppendValues(const std::vector& values, if (valid_bytes) { for (std::size_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(AppendNextOffset()); + UnsafeAppendNextOffset(); if (valid_bytes[i]) { - RETURN_NOT_OK(value_data_builder_.Append( - reinterpret_cast(values[i].data()), values[i].size())); + value_data_builder_.UnsafeAppend( + reinterpret_cast(values[i].data()), values[i].size()); } } } else { for (std::size_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(value_data_builder_.Append( - reinterpret_cast(values[i].data()), 
values[i].size())); + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i].data()), + values[i].size()); } } @@ -193,11 +171,11 @@ Status StringBuilder::AppendValues(const char** values, int64_t length, if (valid_bytes) { int64_t valid_bytes_offset = 0; for (int64_t i = 0; i < length; ++i) { - RETURN_NOT_OK(AppendNextOffset()); + UnsafeAppendNextOffset(); if (valid_bytes[i]) { if (values[i]) { - RETURN_NOT_OK(value_data_builder_.Append( - reinterpret_cast(values[i]), value_lengths[i])); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); } else { UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); UnsafeAppendToBitmap(false); @@ -210,19 +188,19 @@ Status StringBuilder::AppendValues(const char** values, int64_t length, if (have_null_value) { std::vector valid_vector(length, 0); for (int64_t i = 0; i < length; ++i) { - RETURN_NOT_OK(AppendNextOffset()); + UnsafeAppendNextOffset(); if (values[i]) { - RETURN_NOT_OK(value_data_builder_.Append( - reinterpret_cast(values[i]), value_lengths[i])); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); valid_vector[i] = 1; } } UnsafeAppendToBitmap(valid_vector.data(), length); } else { for (int64_t i = 0; i < length; ++i) { - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(value_data_builder_.Append( - reinterpret_cast(values[i]), value_lengths[i])); + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(reinterpret_cast(values[i]), + value_lengths[i]); } UnsafeAppendToBitmap(nullptr, length); } @@ -273,9 +251,10 @@ Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr data; RETURN_NOT_OK(byte_builder_.Finish(&data)); - *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + std::shared_ptr null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); - null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -292,24 +271,46 @@ util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { } // ---------------------------------------------------------------------- -// Decimal128Builder +// ChunkedArray builders + +namespace internal { -Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool) {} +ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_size, MemoryPool* pool) + : max_chunk_size_(max_chunk_size), + chunk_data_size_(0), + builder_(new BinaryBuilder(pool)) {} -Status Decimal128Builder::Append(const Decimal128& value) { - RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); - return FixedSizeBinaryBuilder::Append(value.ToBytes()); +Status ChunkedBinaryBuilder::Finish(ArrayVector* out) { + if (builder_->length() > 0 || chunks_.size() == 0) { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); + } + *out = std::move(chunks_); + return Status::OK(); } -Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr data; - RETURN_NOT_OK(byte_builder_.Finish(&data)); +Status ChunkedBinaryBuilder::NextChunk() { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); + + chunk_data_size_ = 0; + return Status::OK(); +} - *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); +Status 
ChunkedStringBuilder::Finish(ArrayVector* out) { + RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); + // Change data type to string/utf8 + for (size_t i = 0; i < out->size(); ++i) { + std::shared_ptr data = (*out)[i]->data(); + data->type = ::arrow::utf8(); + (*out)[i] = std::make_shared(data); + } return Status::OK(); } +} // namespace internal + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h new file mode 100644 index 0000000000000..abd8387f8094c --- /dev/null +++ b/cpp/src/arrow/array/builder_binary.h @@ -0,0 +1,307 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/buffer-builder.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +// ---------------------------------------------------------------------- +// Binary and String + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { + public: + explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + Status Append(const uint8_t* value, int32_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(util::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. 
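+  ///
+  /// A sketch of the intended call pattern (illustrative, not part of this
+  /// change):
+  ///
+  ///   BinaryBuilder builder;
+  ///   RETURN_NOT_OK(builder.Reserve(2));       // presize offsets + validity
+  ///   RETURN_NOT_OK(builder.ReserveData(10));  // presize the value bytes
+  ///   builder.UnsafeAppend("hello", 5);
+  ///   builder.UnsafeAppend("world", 5);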
+  void UnsafeAppend(const uint8_t* value, int32_t length) {
+    UnsafeAppendNextOffset();
+    value_data_builder_.UnsafeAppend(value, length);
+    UnsafeAppendToBitmap(true);
+  }
+
+  void UnsafeAppend(const char* value, int32_t length) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  void UnsafeAppend(const std::string& value) {
+    UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size()));
+  }
+
+  void UnsafeAppendNull() {
+    const int64_t num_bytes = value_data_builder_.length();
+    offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+    UnsafeAppendToBitmap(false);
+  }
+
+  void Reset() override;
+  Status Resize(int64_t capacity) override;
+
+  /// \brief Ensures there is enough allocated capacity to append the indicated
+  /// number of bytes to the value data buffer without additional allocations
+  Status ReserveData(int64_t elements);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \return size of values buffer so far
+  int64_t value_data_length() const { return value_data_builder_.length(); }
+  /// \return capacity of values buffer
+  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
+
+  /// Temporary access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
+
+  /// Temporary access to a value.
+  ///
+  /// This view becomes invalid on the next modifying operation.
+  util::string_view GetView(int64_t i) const;
+
+ protected:
+  TypedBufferBuilder<int32_t> offsets_builder_;
+  TypedBufferBuilder<uint8_t> value_data_builder_;
+
+  Status AppendOverflow(int64_t num_bytes);
+
+  Status AppendNextOffset() {
+    const int64_t num_bytes = value_data_builder_.length();
+    if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
+      return AppendOverflow(num_bytes);
+    }
+    return offsets_builder_.Append(static_cast<int32_t>(num_bytes));
+  }
+
+  void UnsafeAppendNextOffset() {
+    const int64_t num_bytes = value_data_builder_.length();
+    offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+  }
+};
+
+/// \class StringBuilder
+/// \brief Builder class for UTF8 strings
+class ARROW_EXPORT StringBuilder : public BinaryBuilder {
+ public:
+  using BinaryBuilder::BinaryBuilder;
+  explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+  using BinaryBuilder::Append;
+  using BinaryBuilder::Reset;
+  using BinaryBuilder::UnsafeAppend;
+
+  /// \brief Append a sequence of strings in one shot.
+  ///
+  /// \param[in] values a vector of strings
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const std::vector<std::string>& values,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a sequence of nul-terminated strings in one shot.
+  /// If one of the values is NULL, it is processed as a null
+  /// value even if the corresponding valid_bytes entry is 1.
+  ///
+  /// \param[in] values a contiguous C array of nul-terminated char *
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const char** values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+};
+
+// ----------------------------------------------------------------------
+// FixedSizeBinaryBuilder
+
+class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
+ public:
+  FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
+                         MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+  Status Append(const uint8_t* value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(true);
+    return byte_builder_.Append(value, byte_width_);
+  }
+
+  Status Append(const char* value) {
+    return Append(reinterpret_cast<const uint8_t*>(value));
+  }
+
+  Status Append(const util::string_view& view) {
+#ifndef NDEBUG
+    CheckValueSize(static_cast<int64_t>(view.size()));
+#endif
+    return Append(reinterpret_cast<const uint8_t*>(view.data()));
+  }
+
+  Status Append(const std::string& s) {
+#ifndef NDEBUG
+    CheckValueSize(static_cast<int64_t>(s.size()));
+#endif
+    return Append(reinterpret_cast<const uint8_t*>(s.data()));
+  }
+
+  template <size_t NBYTES>
+  Status Append(const std::array<uint8_t, NBYTES>& value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(true);
+    return byte_builder_.Append(value);
+  }
+
+  Status AppendValues(const uint8_t* data, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+  Status AppendNull();
+
+  void Reset() override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \return size of values buffer so far
+  int64_t value_data_length() const { return byte_builder_.length(); }
+
+  int32_t byte_width() const { return byte_width_; }
+
+  /// Temporary access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  const uint8_t* GetValue(int64_t i) const;
+
+  /// Temporary access to a value.
+  ///
+  /// This view becomes invalid on the next modifying operation.
+  util::string_view GetView(int64_t i) const;
+
+ protected:
+  int32_t byte_width_;
+  BufferBuilder byte_builder_;
+
+#ifndef NDEBUG
+  void CheckValueSize(int64_t size);
+#endif
+};
+
+// ----------------------------------------------------------------------
+// Chunked builders: build a sequence of BinaryArray or StringArray that are
+// each limited to a particular size (with an upper limit of 2GB)
+
+namespace internal {
+
+class ARROW_EXPORT ChunkedBinaryBuilder {
+ public:
+  ChunkedBinaryBuilder(int32_t max_chunk_size,
+                       MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+  virtual ~ChunkedBinaryBuilder() = default;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) {
+      // Move onto next chunk, unless the builder length is currently 0, which
+      // means that max_chunk_size_ is less than the item length
+      if (builder_->length() > 0) {
+        ARROW_RETURN_NOT_OK(NextChunk());
+      }
+      // else fall through
+    }
+
+    chunk_data_size_ += length;
+    return builder_->Append(value, length);
+  }
+
+  Status Append(const util::string_view& value) {
+    return Append(reinterpret_cast<const uint8_t*>(value.data()),
+                  static_cast<int32_t>(value.size()));
+  }
+
+  Status AppendNull() {
+    if (ARROW_PREDICT_FALSE(builder_->length() ==
+                            std::numeric_limits<int32_t>::max())) {
+      ARROW_RETURN_NOT_OK(NextChunk());
+    }
+    return builder_->AppendNull();
+  }
+
+  Status Reserve(int64_t values) { return builder_->Reserve(values); }
+
+  virtual Status Finish(ArrayVector* out);
+
+ protected:
+  Status NextChunk();
+
+  int32_t max_chunk_size_;
+  int32_t chunk_data_size_;
+
+  std::unique_ptr<BinaryBuilder> builder_;
+  std::vector<std::shared_ptr<Array>> chunks_;
+};
+
+class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
+ public:
+  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
+
+  Status Finish(ArrayVector* out) override;
+};
+
+}  // namespace internal
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc
new file mode 100644
index 0000000000000..191a0ff014078
--- /dev/null
+++ b/cpp/src/arrow/array/builder_decimal.cc
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
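(A hedged usage sketch for the chunked builders above — illustrative only; the chunk size is arbitrary. Each time a chunk fills up, the accumulated values are finished into a separate array:)

    arrow::Status BuildChunksExample(arrow::ArrayVector* chunks) {
      arrow::internal::ChunkedStringBuilder builder(/*max_chunk_size=*/32);
      for (int i = 0; i < 100; ++i) {
        // Rolls over to a new chunk whenever ~32 bytes have accumulated
        ARROW_RETURN_NOT_OK(builder.Append("some value"));
      }
      return builder.Finish(chunks);  // several StringArray chunks
    }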
+
+#include "arrow/array/builder_decimal.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Decimal128Builder
+
+Decimal128Builder::Decimal128Builder(const std::shared_ptr<DataType>& type,
+                                     MemoryPool* pool)
+    : FixedSizeBinaryBuilder(type, pool) {}
+
+Status Decimal128Builder::Append(const Decimal128& value) {
+  RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
+  return FixedSizeBinaryBuilder::Append(value.ToBytes());
+}
+
+Status Decimal128Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Buffer> data;
+  RETURN_NOT_OK(byte_builder_.Finish(&data));
+  std::shared_ptr<Buffer> null_bitmap;
+  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+  *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_);
+
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h
new file mode 100644
index 0000000000000..fb40a7950abbd
--- /dev/null
+++ b/cpp/src/arrow/array/builder_decimal.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+
+namespace arrow {
+
+class Decimal128;
+
+class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
+ public:
+  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
+                             MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(const Decimal128& val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+};
+
+using DecimalBuilder = Decimal128Builder;
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/builder-dict.cc b/cpp/src/arrow/array/builder_dict.cc
similarity index 57%
rename from cpp/src/arrow/builder-dict.cc
rename to cpp/src/arrow/array/builder_dict.cc
index b021c3a9d37cc..cfc3d3d4b1a05 100644
--- a/cpp/src/arrow/builder-dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -15,24 +15,135 @@
 // specific language governing permissions and limitations
 // under the License.
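(A hedged usage sketch for the decimal builder above — the precision, scale, and value are arbitrary; Decimal128 can be constructed from an integer holding the scaled value:)

    arrow::Status BuildDecimalExample(std::shared_ptr<arrow::Array>* out) {
      arrow::Decimal128Builder builder(arrow::decimal(12, 2));
      // 12345 at scale 2 represents 123.45
      ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal128(12345)));
      return builder.Finish(out);
    }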
+#include "arrow/array/builder_dict.h" + +#include #include +#include +#include +#include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// DictionaryType unification + +struct UnifyDictionaryValues { + MemoryPool* pool_; + std::shared_ptr value_type_; + const std::vector& types_; + std::shared_ptr* out_values_; + std::vector>* out_transpose_maps_; + + Status Visit(const DataType&, void* = nullptr) { + // Default implementation for non-dictionary-supported datatypes + std::stringstream ss; + ss << "Unification of " << value_type_->ToString() + << " dictionaries is not implemented"; + return Status::NotImplemented(ss.str()); + } + + template + Status Visit(const T&, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + using ArrayType = typename TypeTraits::ArrayType; + using DictTraits = typename internal::DictionaryTraits; + using MemoTableType = typename DictTraits::MemoTableType; + + MemoTableType memo_table; + if (out_transpose_maps_ != nullptr) { + out_transpose_maps_->clear(); + out_transpose_maps_->reserve(types_.size()); + } + // Build up the unified dictionary values and the transpose maps + for (const auto& type : types_) { + const ArrayType& values = checked_cast(*type->dictionary()); + if (out_transpose_maps_ != nullptr) { + std::vector transpose_map; + transpose_map.reserve(values.length()); + for (int64_t i = 0; i < values.length(); ++i) { + int32_t dict_index = memo_table.GetOrInsert(values.GetView(i)); + transpose_map.push_back(dict_index); + } + out_transpose_maps_->push_back(std::move(transpose_map)); + } else { + for (int64_t i = 0; i < values.length(); ++i) { + memo_table.GetOrInsert(values.GetView(i)); + } + } + } + // Build unified dictionary array + std::shared_ptr data; + RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table, + 0 /* start_offset */, &data)); + *out_values_ = MakeArray(data); + return Status::OK(); + } +}; + +Status DictionaryType::Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps) { + if (types.size() == 0) { + return Status::Invalid("need at least one input type"); + } + std::vector dict_types; + dict_types.reserve(types.size()); + for (const auto& type : types) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("input types must be dictionary types"); + } + dict_types.push_back(checked_cast(type)); + } + + // XXX Should we check the ordered flag? 
+  auto value_type = dict_types[0]->dictionary()->type();
+  for (const auto& type : dict_types) {
+    auto values = type->dictionary();
+    if (!values->type()->Equals(value_type)) {
+      return Status::TypeError("input types have different value types");
+    }
+    if (values->null_count() != 0) {
+      return Status::TypeError("input types have null values");
+    }
+  }
+
+  std::shared_ptr<Array> values;
+  {
+    UnifyDictionaryValues visitor{pool, value_type, dict_types, &values,
+                                  out_transpose_maps};
+    RETURN_NOT_OK(VisitTypeInline(*value_type, &visitor));
+  }
+
+  // Build unified dictionary type with the right index type
+  std::shared_ptr<DataType> index_type;
+  if (values->length() <= std::numeric_limits<int8_t>::max()) {
+    index_type = int8();
+  } else if (values->length() <= std::numeric_limits<int16_t>::max()) {
+    index_type = int16();
+  } else if (values->length() <= std::numeric_limits<int32_t>::max()) {
+    index_type = int32();
+  } else {
+    index_type = int64();
+  }
+  *out_type = arrow::dictionary(index_type, values);
+  return Status::OK();
+}
+
 // ----------------------------------------------------------------------
 // DictionaryBuilder
 
@@ -50,7 +161,11 @@ DictionaryBuilder<T>::~DictionaryBuilder() {}
 template <typename T>
 DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
                                         MemoryPool* pool)
-    : ArrayBuilder(type, pool), byte_width_(-1), values_builder_(pool) {
+    : ArrayBuilder(type, pool),
+      memo_table_(new MemoTableImpl(0)),
+      delta_offset_(0),
+      byte_width_(-1),
+      values_builder_(pool) {
   DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder";
 }
 
@@ -64,13 +179,15 @@
 template <>
 DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
     const std::shared_ptr<DataType>& type, MemoryPool* pool)
     : ArrayBuilder(type, pool),
+      memo_table_(new MemoTableImpl(0)),
+      delta_offset_(0),
       byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
 
 template <typename T>
 void DictionaryBuilder<T>::Reset() {
   ArrayBuilder::Reset();
   values_builder_.Reset();
-  memo_table_.reset();
+  memo_table_.reset(new MemoTableImpl(0));
   delta_offset_ = 0;
 }
 
@@ -82,7 +199,6 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) {
   if (capacity_ == 0) {
     // Initialize hash table
     // XXX should we let the user pass additional size heuristics?
-    memo_table_.reset(new MemoTableImpl(0));
     delta_offset_ = 0;
   }
   RETURN_NOT_OK(values_builder_.Resize(capacity));
@@ -103,25 +219,53 @@ Status DictionaryBuilder<T>::Append(const Scalar& value) {
   auto memo_index = memo_table_->GetOrInsert(value);
   RETURN_NOT_OK(values_builder_.Append(memo_index));
+  length_ += 1;
 
   return Status::OK();
 }
 
 template <typename T>
 Status DictionaryBuilder<T>::AppendNull() {
+  length_ += 1;
+  null_count_ += 1;
+
+  return values_builder_.AppendNull();
+}
 
-Status DictionaryBuilder<NullType>::AppendNull() { return values_builder_.AppendNull(); }
+Status DictionaryBuilder<NullType>::AppendNull() {
+  length_ += 1;
+  null_count_ += 1;
+
+  return values_builder_.AppendNull();
+}
 
 template <typename T>
 Status DictionaryBuilder<T>::AppendArray(const Array& array) {
-  const auto& numeric_array = checked_cast<const NumericArray<T>&>(array);
+  using ArrayType = typename TypeTraits<T>::ArrayType;
+
+  const auto& concrete_array = checked_cast<const ArrayType&>(array);
   for (int64_t i = 0; i < array.length(); i++) {
     if (array.IsNull(i)) {
       RETURN_NOT_OK(AppendNull());
     } else {
-      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+      RETURN_NOT_OK(Append(concrete_array.GetView(i)));
     }
   }
   return Status::OK();
 }
+
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
+  if (!type_->Equals(*array.type())) {
+    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
+  }
+
+  const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(typed_array.GetValue(i)));
+    }
+  }
+  return Status::OK();
+}
@@ -166,46 +310,6 @@
-//
-// StringType and BinaryType specializations
-//
-
-#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                            \
-                                                                           \
-  template <>                                                              \
-  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {        \
-    using ArrayType = typename TypeTraits<Type>::ArrayType;                \
-    const ArrayType& binary_array = checked_cast<const ArrayType&>(array); \
-    for (int64_t i = 0; i < array.length(); i++) {                         \
-      if (array.IsNull(i)) {                                               \
-        RETURN_NOT_OK(AppendNull());                                       \
-      } else {                                                             \
-        RETURN_NOT_OK(Append(binary_array.GetView(i)));                    \
-      }                                                                    \
-    }                                                                      \
-    return Status::OK();                                                   \
-  }
-
-BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
-BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
-
-template <>
-Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
-  if (!type_->Equals(*array.type())) {
-    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
-  }
-
-  const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
-  for (int64_t i = 0; i < array.length(); i++) {
-    if (array.IsNull(i)) {
-      RETURN_NOT_OK(AppendNull());
-    } else {
-      RETURN_NOT_OK(Append(typed_array.GetValue(i)));
-    }
-  }
-  return Status::OK();
-}
-
 template class DictionaryBuilder<UInt8Type>;
 template class DictionaryBuilder<UInt16Type>;
 template class DictionaryBuilder<UInt32Type>;
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
new file mode 100644
index 0000000000000..6f0271683aea2
--- /dev/null
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -0,0 +1,167 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/builder_adaptive.h"  // IWYU pragma: export
+#include "arrow/array/builder_base.h"      // IWYU pragma: export
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Dictionary builder
+
+namespace internal {
+
+template <typename T>
+struct DictionaryScalar {
+  using type = typename T::c_type;
+};
+
+template <>
+struct DictionaryScalar<BinaryType> {
+  using type = util::string_view;
+};
+
+template <>
+struct DictionaryScalar<StringType> {
+  using type = util::string_view;
+};
+
+template <>
+struct DictionaryScalar<FixedSizeBinaryType> {
+  using type = util::string_view;
+};
+
+}  // namespace internal
+
+/// \brief Array builder for creating an encoded DictionaryArray from
+/// dense array data
+///
+/// Unlike other builders, dictionary builder does not completely reset the state
+/// on Finish calls. The arrays built after the initial Finish call will reuse
+/// the previously created encoding and build a delta dictionary when new terms
+/// occur.
+template <typename T>
+class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
+ public:
+  using Scalar = typename internal::DictionaryScalar<T>::type;
+
+  // WARNING: the type given below is the value type, not the DictionaryType.
+  // The DictionaryType is instantiated on the Finish() call.
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+
+  template <typename T1 = T>
+  explicit DictionaryBuilder(
+      typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool)
+      : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
+
+  ~DictionaryBuilder() override;
+
+  /// \brief Append a scalar value
+  Status Append(const Scalar& value);
+
+  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
+  template <typename T1 = T>
+  Status Append(typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
+                                        const uint8_t*>::type value) {
+    return Append(util::string_view(reinterpret_cast<const char*>(value), byte_width_));
+  }
+
+  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
+  template <typename T1 = T>
+  Status Append(typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
+                                        const char*>::type value) {
+    return Append(util::string_view(value, byte_width_));
+  }
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  void Reset() override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// Is the dictionary builder in delta-building mode?
+  bool is_building_delta() { return delta_offset_ > 0; }
+
+ protected:
+  class MemoTableImpl;
+  std::unique_ptr<MemoTableImpl> memo_table_;
+
+  int32_t delta_offset_;
+  // Only used for FixedSizeBinaryType
+  int32_t byte_width_;
+
+  AdaptiveIntBuilder values_builder_;
+};
+
+template <>
+class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
+ public:
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+  explicit DictionaryBuilder(MemoryPool* pool);
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ protected:
+  AdaptiveIntBuilder values_builder_;
+};
+
+class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
+ public:
+  using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(reinterpret_cast<const char*>(value), length);
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(util::string_view(value, length));
+  }
+};
+
+/// \brief Dictionary array builder with convenience methods for strings
+class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
+ public:
+  using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(reinterpret_cast<const char*>(value), length);
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(util::string_view(value, length));
+  }
+};
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc
new file mode 100644
index 0000000000000..2f600cd9b9228
--- /dev/null
+++ b/cpp/src/arrow/array/builder_nested.cc
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_nested.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/int-util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// ListBuilder
+
+ListBuilder::ListBuilder(MemoryPool* pool,
+                         std::shared_ptr<ArrayBuilder> const& value_builder,
+                         const std::shared_ptr<DataType>& type)
+    : ArrayBuilder(type ? type
+                        : std::static_pointer_cast<DataType>(
+                              std::make_shared<ListType>(value_builder->type())),
+                   pool),
+      offsets_builder_(pool),
+      value_builder_(value_builder) {}
+
+Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length,
+                                 const uint8_t* valid_bytes) {
+  RETURN_NOT_OK(Reserve(length));
+  UnsafeAppendToBitmap(valid_bytes, length);
+  offsets_builder_.UnsafeAppend(offsets, length);
+  return Status::OK();
+}
+
+Status ListBuilder::AppendNextOffset() {
+  const int64_t num_values = value_builder_->length();
+  ARROW_RETURN_IF(
+      num_values > kListMaximumElements,
+      Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,",
+                            " have ", num_values));
+  return offsets_builder_.Append(static_cast<int32_t>(num_values));
+}
+
+Status ListBuilder::Append(bool is_valid) {
+  RETURN_NOT_OK(Reserve(1));
+  UnsafeAppendToBitmap(is_valid);
+  return AppendNextOffset();
+}
+
+Status ListBuilder::Resize(int64_t capacity) {
+  DCHECK_LE(capacity, kListMaximumElements);
+  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+
+  // one more than requested for offsets
+  RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
+  return ArrayBuilder::Resize(capacity);
+}
+
+Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  RETURN_NOT_OK(AppendNextOffset());
+
+  // Offset padding zeroed by BufferBuilder
+  std::shared_ptr<Buffer> offsets;
+  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+
+  std::shared_ptr<ArrayData> items;
+  if (values_) {
+    items = values_->data();
+  } else {
+    if (value_builder_->length() == 0) {
+      // Try to make sure we get a non-null values buffer (ARROW-2744)
+      RETURN_NOT_OK(value_builder_->Resize(0));
+    }
+    RETURN_NOT_OK(value_builder_->FinishInternal(&items));
+  }
+
+  std::shared_ptr<Buffer> null_bitmap;
+  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+  *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_);
+  (*out)->child_data.emplace_back(std::move(items));
+  Reset();
+  return Status::OK();
+}
+
+void ListBuilder::Reset() {
+  ArrayBuilder::Reset();
+  values_.reset();
+  offsets_builder_.Reset();
+  value_builder_->Reset();
+}
+
+ArrayBuilder* ListBuilder::value_builder() const {
+  DCHECK(!values_) << "Using value builder is pointless when values_ is set";
+  return value_builder_.get();
+}
+
+// ----------------------------------------------------------------------
+// Struct
+
+StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+                             std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders)
+    : ArrayBuilder(type, pool) {
+  children_ = std::move(field_builders);
+}
+
+void StructBuilder::Reset() {
+  ArrayBuilder::Reset();
+  for (const auto& field_builder : children_) {
+    field_builder->Reset();
+  }
+}
+
+Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Buffer> null_bitmap;
+  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+  *out = ArrayData::Make(type_, length_, {null_bitmap}, null_count_);
+
+  (*out)->child_data.resize(children_.size());
+  for (size_t i = 0; i < children_.size(); ++i) {
+    if (length_ == 0) {
+      // Try to make sure the child buffers are initialized
+      RETURN_NOT_OK(children_[i]->Resize(0));
+    }
+    RETURN_NOT_OK(children_[i]->FinishInternal(&(*out)->child_data[i]));
+  }
+
+  capacity_ = length_ = null_count_ = 0;
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h
new file mode 100644
index 0000000000000..19b0ad81b5a16
--- /dev/null
+++ b/cpp/src/arrow/array/builder_nested.h
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/buffer-builder.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// List builder
+
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
+///
+/// To use this class, you must append values to the child array builder and use
+/// the Append function to delimit each distinct list value (once the values
+/// have been appended to the child array) or use the bulk API to append
+/// a sequence of offsets and null values.
+///
+/// A note on types. Per arrow/type.h all types in the c++ implementation are
+/// logical so even though this class always builds a list array, it can
+/// represent multiple different logical types. If no logical type is provided
+/// at construction time, the class defaults to List<T> where T is taken from
+/// the value_builder/values that the object is constructed with.
+class ARROW_EXPORT ListBuilder : public ArrayBuilder {
+ public:
+  /// Use this constructor to incrementally build the value array along with offsets and
+  /// null bitmap.
+  ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
+              const std::shared_ptr<DataType>& type = NULLPTR);
+
+  Status Resize(int64_t capacity) override;
+  void Reset() override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const int32_t* offsets, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Start a new variable-length list slot
+  ///
+  /// This function should be called before beginning to append elements to the
+  /// value builder
+  Status Append(bool is_valid = true);
+
+  Status AppendNull() { return Append(false); }
+
+  ArrayBuilder* value_builder() const;
+
+ protected:
+  TypedBufferBuilder<int32_t> offsets_builder_;
+  std::shared_ptr<ArrayBuilder> value_builder_;
+  std::shared_ptr<Array> values_;
+
+  Status AppendNextOffset();
+};
+
+// ----------------------------------------------------------------------
+// StructArray builder
+
+/// The Append, Resize, and Reserve methods act on the StructBuilder itself.
+/// Make sure to call these methods consistently with the child builders'
+/// own methods to maintain data-structure consistency.
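+///
+/// For example (a sketch, not part of this change), for a struct<a: int32,
+/// b: bool> builder, each logical row requires one Append on the
+/// StructBuilder plus one append on every child builder:
+///
+///   RETURN_NOT_OK(struct_builder.Append());
+///   RETURN_NOT_OK(int_builder->Append(42));
+///   RETURN_NOT_OK(bool_builder->Append(true));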
+class ARROW_EXPORT StructBuilder : public ArrayBuilder {
+ public:
+  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+                std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// The null bitmap is of equal length to every child field, and any zero byte
+  /// will be considered a null for that field. Users must still use the append
+  /// or advance methods of the child builders independently to insert data.
+  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
+  }
+
+  /// Append an element to the Struct. The Append method of each child builder
+  /// must be called independently to maintain data-structure consistency.
+  Status Append(bool is_valid = true) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(is_valid);
+    return Status::OK();
+  }
+
+  Status AppendNull() { return Append(false); }
+
+  void Reset() override;
+
+  ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
+
+  int num_fields() const { return static_cast<int>(children_.size()); }
+};
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc
new file mode 100644
index 0000000000000..a593f362dd29a
--- /dev/null
+++ b/cpp/src/arrow/array/builder_primitive.cc
@@ -0,0 +1,273 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
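(To make the nested-builder contract concrete, a minimal sketch — illustrative only — that builds the list<int32> value [[1, 2], [3]] with the builders declared above:)

    arrow::Status BuildListExample(std::shared_ptr<arrow::Array>* out) {
      auto pool = arrow::default_memory_pool();
      auto values = std::make_shared<arrow::Int32Builder>(pool);
      arrow::ListBuilder list_builder(pool, values);
      ARROW_RETURN_NOT_OK(list_builder.Append());  // delimit [1, 2]
      ARROW_RETURN_NOT_OK(values->Append(1));
      ARROW_RETURN_NOT_OK(values->Append(2));
      ARROW_RETURN_NOT_OK(list_builder.Append());  // delimit [3]
      ARROW_RETURN_NOT_OK(values->Append(3));
      return list_builder.Finish(out);
    }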
+
+#include "arrow/array/builder_primitive.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/int-util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Null builder
+
+Status NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  *out = ArrayData::Make(null(), length_, {nullptr}, length_);
+  length_ = null_count_ = 0;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+
+template <typename T>
+void PrimitiveBuilder<T>::Reset() {
+  data_.reset();
+  raw_data_ = nullptr;
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::Resize(int64_t capacity) {
+  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+  capacity = std::max(capacity, kMinBuilderCapacity);
+
+  int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
+  if (capacity_ == 0) {
+    RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_));
+  } else {
+    RETURN_NOT_OK(data_->Resize(nbytes));
+  }
+
+  raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
+  return ArrayBuilder::Resize(capacity);
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t length,
+                                         const uint8_t* valid_bytes) {
+  RETURN_NOT_OK(Reserve(length));
+
+  if (length > 0) {
+    std::memcpy(raw_data_ + length_, values,
+                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
+  }
+
+  // length_ is updated by this
+  ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+  return Status::OK();
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t length,
+                                         const std::vector<bool>& is_valid) {
+  RETURN_NOT_OK(Reserve(length));
+  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+
+  if (length > 0) {
+    std::memcpy(raw_data_ + length_, values,
+                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
+  }
+
+  // length_ is updated by this
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values,
+                                         const std::vector<bool>& is_valid) {
+  return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values) {
+  return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_), data_.get()));
+  std::shared_ptr<Buffer> null_bitmap;
+  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+  *out = ArrayData::Make(type_, length_, {null_bitmap, data_}, null_count_);
+
+  data_ = nullptr;
+  capacity_ = length_ = null_count_ = 0;
+
+  return Status::OK();
+}
+
+template class PrimitiveBuilder<UInt8Type>;
+template class PrimitiveBuilder<UInt16Type>;
+template class PrimitiveBuilder<UInt32Type>;
+template class PrimitiveBuilder<UInt64Type>;
+template class PrimitiveBuilder<Int8Type>;
+template class PrimitiveBuilder<Int16Type>;
+template class PrimitiveBuilder<Int32Type>;
+template class PrimitiveBuilder<Int64Type>;
+template class PrimitiveBuilder<TimestampType>;
+template class PrimitiveBuilder<Time32Type>;
+template class PrimitiveBuilder<Time64Type>;
+template class PrimitiveBuilder<Date32Type>;
+template class PrimitiveBuilder<Date64Type>;
+template class PrimitiveBuilder<HalfFloatType>;
+template class PrimitiveBuilder<FloatType>;
+template class PrimitiveBuilder<DoubleType>;
+
+BooleanBuilder::BooleanBuilder(MemoryPool* pool)
+    : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {}
+
+BooleanBuilder::BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+    : BooleanBuilder(pool) {
+  DCHECK_EQ(Type::BOOL, type->id());
+}
+
+void BooleanBuilder::Reset() {
+  ArrayBuilder::Reset();
+  data_.reset();
+  raw_data_ = nullptr;
+}
+
+Status BooleanBuilder::Resize(int64_t capacity) {
+  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+  capacity = std::max(capacity, kMinBuilderCapacity);
+
+  const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity);
+  if (capacity_ == 0) {
+    RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_));
+    raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+
+    // We zero the memory for booleans to keep things simple; for some reason if
+    // we do not, even though we may write every bit (through in-place | or &),
+    // valgrind will still show a warning. If we do not zero the bytes here, we
+    // will have to be careful to zero them in AppendNull and AppendNulls. Also,
+    // zeroing the bits results in deterministic bits when each byte may have a
+    // mix of nulls and not nulls.
+    //
+    // We only zero up to new_bitmap_size because the padding was zeroed by
+    // AllocateResizableBuffer
+    memset(raw_data_, 0, static_cast<size_t>(new_bitmap_size));
+  } else {
+    const int64_t old_bitmap_capacity = data_->capacity();
+    RETURN_NOT_OK(data_->Resize(new_bitmap_size));
+    const int64_t new_bitmap_capacity = data_->capacity();
+    raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+
+    // See comment above about why we zero memory for booleans
+    memset(raw_data_ + old_bitmap_capacity, 0,
+           static_cast<size_t>(new_bitmap_capacity - old_bitmap_capacity));
+  }
+
+  return ArrayBuilder::Resize(capacity);
+}
+
+Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  int64_t bit_offset = length_ % 8;
+  if (bit_offset > 0) {
+    // Adjust last byte
+    data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset];
+  }
+
+  std::shared_ptr<Buffer> null_bitmap;
+  RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get()));
+
+  *out = ArrayData::Make(boolean(), length_, {null_bitmap, data_}, null_count_);
+
+  data_ = nullptr;
+  capacity_ = length_ = null_count_ = 0;
+  return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
+                                    const uint8_t* valid_bytes) {
+  RETURN_NOT_OK(Reserve(length));
+
+  int64_t i = 0;
+  internal::GenerateBitsUnrolled(raw_data_, length_, length,
+                                 [values, &i]() -> bool { return values[i++] != 0; });
+
+  // this updates length_
+  ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+  return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
+                                    const std::vector<bool>& is_valid) {
+  RETURN_NOT_OK(Reserve(length));
+  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+
+  int64_t i = 0;
+  internal::GenerateBitsUnrolled(raw_data_, length_, length,
+                                 [values, &i]() -> bool { return values[i++]; });
+
+  // this updates length_
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<uint8_t>& values,
+                                    const std::vector<bool>& is_valid) {
+  return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<uint8_t>& values) {
+  return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<bool>& values,
+                                    const std::vector<bool>& is_valid) {
+  const int64_t length = static_cast<int64_t>(values.size());
+  RETURN_NOT_OK(Reserve(length));
+  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+
+  int64_t i = 0;
+  internal::GenerateBitsUnrolled(raw_data_, length_, length,
+                                 [&values, &i]() -> bool { return values[i++]; });
+
+  // this updates length_
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
+Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
+  const int64_t length = static_cast<int64_t>(values.size());
+  RETURN_NOT_OK(Reserve(length));
+
+  int64_t i = 0;
+  internal::GenerateBitsUnrolled(raw_data_, length_, length,
+                                 [&values, &i]() -> bool { return values[i++]; });
+
+  // this updates length_
+  ArrayBuilder::UnsafeSetNotNull(length);
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h
new file mode 100644
index 0000000000000..d17a13013ceae
--- /dev/null
+++ b/cpp/src/arrow/array/builder_primitive.h
@@ -0,0 +1,412 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+class ARROW_EXPORT NullBuilder : public ArrayBuilder {
+ public:
+  explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
+      : ArrayBuilder(null(), pool) {}
+
+  Status AppendNull() {
+    ++null_count_;
+    ++length_;
+    return Status::OK();
+  }
+
+  Status Append(std::nullptr_t value) { return AppendNull(); }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+};
+
+template <typename Type>
+class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
+ public:
+  using value_type = typename Type::c_type;
+
+  explicit PrimitiveBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {}
+
+  using ArrayBuilder::Advance;
+
+  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+  /// The memory at the corresponding data slot is set to 0 to prevent
+  /// uninitialized memory access
+  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    memset(raw_data_ + length_, 0,
+           static_cast<size_t>(TypeTraits<Type>::bytes_required(length)));
+    UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
+  }
+
+  /// \brief Append a single null element
+  Status AppendNull() {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    memset(raw_data_ + length_, 0, sizeof(value_type));
+    UnsafeAppendToBitmap(false);
+    return Status::OK();
+  }
+
+  value_type GetValue(int64_t index) const {
+    return reinterpret_cast<const value_type*>(data_->data())[index];
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const value_type* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const value_type* values, int64_t length,
+                      const std::vector<bool>& is_valid);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector of values
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const std::vector<value_type>& values,
+                      const std::vector<bool>& is_valid);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector of values
+  /// \return Status
+  Status AppendValues(const std::vector<value_type>& values);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \return Status
+
+  template <typename ValuesIter>
+  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+
+    std::copy(values_begin, values_end, raw_data_ + length_);
+
+    // this updates the length_
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of elements in one shot, with a specified nullmap
+  /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
+  /// or null (0) values.
+  /// \return Status
+  template <typename ValuesIter, typename ValidIter>
+  typename std::enable_if<!std::is_pointer<ValidIter>::value, Status>::type AppendValues(
+      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+    static_assert(!internal::is_null_pointer<ValidIter>::value,
+                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
+                  "version instead");
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+
+    std::copy(values_begin, values_end, raw_data_ + length_);
+
+    // this updates the length_
+    for (int64_t i = 0; i != length; ++i) {
+      UnsafeAppendToBitmap(*valid_begin);
+      ++valid_begin;
+    }
+    return Status::OK();
+  }
+
+  // Same as above, with a pointer type ValidIter
+  template <typename ValuesIter, typename ValidIter>
+  typename std::enable_if<std::is_pointer<ValidIter>::value, Status>::type AppendValues(
+      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+
+    std::copy(values_begin, values_end, raw_data_ + length_);
+
+    // this updates the length_
+    if (valid_begin == NULLPTR) {
+      UnsafeSetNotNull(length);
+    } else {
+      for (int64_t i = 0; i != length; ++i) {
+        UnsafeAppendToBitmap(*valid_begin);
+        ++valid_begin;
+      }
+    }
+
+    return Status::OK();
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+  void Reset() override;
+
+  Status Resize(int64_t capacity) override;
+
+ protected:
+  std::shared_ptr<ResizableBuffer> data_;
+  value_type* raw_data_;
+};
+
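As an aside, a minimal usage sketch of the numeric builders layered on this class (illustrative only, not part of the patch; the helper name BuildInt64Example is hypothetical):

#include <memory>
#include <vector>
#include "arrow/builder.h"

// Hypothetical helper, for illustration only.
arrow::Status BuildInt64Example(std::shared_ptr<arrow::Array>* out) {
  arrow::Int64Builder builder;                // alias for NumericBuilder<Int64Type>
  ARROW_RETURN_NOT_OK(builder.Append(1));     // checked scalar append
  ARROW_RETURN_NOT_OK(builder.AppendNull());  // null slot; the data is zeroed
  std::vector<int64_t> more = {2, 3, 4};
  ARROW_RETURN_NOT_OK(builder.AppendValues(more));  // bulk append
  return builder.Finish(out);                 // finalizes into an Int64Array
}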
+/// Base class for all Builders that emit an Array of a scalar numerical type.
+template <typename T>
+class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
+ public:
+  using typename PrimitiveBuilder<T>::value_type;
+  using PrimitiveBuilder<T>::PrimitiveBuilder;
+
+  template <typename T1 = T>
+  explicit NumericBuilder(
+      typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool
+          ARROW_MEMORY_POOL_DEFAULT)
+      : PrimitiveBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
+
+  using ArrayBuilder::UnsafeAppendNull;
+  using ArrayBuilder::UnsafeAppendToBitmap;
+  using PrimitiveBuilder<T>::AppendValues;
+  using PrimitiveBuilder<T>::Resize;
+  using PrimitiveBuilder<T>::Reserve;
+
+  /// Append a single scalar and increase the size if necessary.
+  Status Append(const value_type val) {
+    ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
+    UnsafeAppend(val);
+    return Status::OK();
+  }
+
+  /// Append a single scalar under the assumption that the underlying Buffer is
+  /// large enough.
+  ///
+  /// This method does not capacity-check; make sure to call Reserve
+  /// beforehand.
+  void UnsafeAppend(const value_type val) {
+    raw_data_[length_] = val;
+    UnsafeAppendToBitmap(true);
+  }
+
+ protected:
+  using PrimitiveBuilder<T>::length_;
+  using PrimitiveBuilder<T>::raw_data_;
+};
+
+// Builders
+
+using UInt8Builder = NumericBuilder<UInt8Type>;
+using UInt16Builder = NumericBuilder<UInt16Type>;
+using UInt32Builder = NumericBuilder<UInt32Type>;
+using UInt64Builder = NumericBuilder<UInt64Type>;
+
+using Int8Builder = NumericBuilder<Int8Type>;
+using Int16Builder = NumericBuilder<Int16Type>;
+using Int32Builder = NumericBuilder<Int32Type>;
+using Int64Builder = NumericBuilder<Int64Type>;
+using TimestampBuilder = NumericBuilder<TimestampType>;
+using Time32Builder = NumericBuilder<Time32Type>;
+using Time64Builder = NumericBuilder<Time64Type>;
+using Date32Builder = NumericBuilder<Date32Type>;
+using Date64Builder = NumericBuilder<Date64Type>;
+
+using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
+using FloatBuilder = NumericBuilder<FloatType>;
+using DoubleBuilder = NumericBuilder<DoubleType>;
+
+class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
+ public:
+  using value_type = bool;
+  explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
+
+  explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+
+  using ArrayBuilder::Advance;
+  using ArrayBuilder::UnsafeAppendNull;
+
+  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(valid_bytes, length);
+
+    return Status::OK();
+  }
+
+  Status AppendNull() {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(false);
+
+    return Status::OK();
+  }
+
+  /// Scalar append
+  Status Append(const bool val) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(val);
+    return Status::OK();
+  }
+
+  Status Append(const uint8_t val) { return Append(val != 0); }
+
+  /// Scalar append, without checking for capacity
+  void UnsafeAppend(const bool val) {
+    if (val) {
+      BitUtil::SetBit(raw_data_, length_);
+    } else {
+      BitUtil::ClearBit(raw_data_, length_);
+    }
+    UnsafeAppendToBitmap(true);
+  }
+
+  void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous array of bytes (non-zero is 1)
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const uint8_t* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  ///
\param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + for (int64_t i = 0; i != length; ++i) { + ArrayBuilder::UnsafeAppendToBitmap(*valid_begin); + ++valid_begin; + } + return Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); 
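+      // (valid_begin == NULLPTR means all elements are valid, so one bulk
+      // bitmap update suffices; no per-bit writes are needed.)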
+ } else { + for (int64_t i = 0; i != length; ++i) { + ArrayBuilder::UnsafeAppendToBitmap(*valid_begin); + ++valid_begin; + } + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + uint8_t* raw_data_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h new file mode 100644 index 0000000000000..9344d5d92b715 --- /dev/null +++ b/cpp/src/arrow/buffer-builder.h @@ -0,0 +1,348 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_BUFFER_BUILDER_H +#define ARROW_BUFFER_BUILDER_H + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer builder classes + +/// \class BufferBuilder +/// \brief A class for incrementally building a contiguous chunk of in-memory +/// data +class ARROW_EXPORT BufferBuilder { + public: + explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} + + /// \brief Resize the buffer to the nearest multiple of 64 bytes + /// + /// \param new_capacity the new capacity of the of the builder. Will be + /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if + /// new capacity is smaller than the existing size, reallocate internal + /// buffer. Set to false to avoid reallocations when shrinking the builder. 
+  /// \return Status
+  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+    // Resize(0) is a no-op
+    if (new_capacity == 0) {
+      return Status::OK();
+    }
+    int64_t old_capacity = capacity_;
+
+    if (buffer_ == NULLPTR) {
+      ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity, &buffer_));
+    } else {
+      ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
+    }
+    capacity_ = buffer_->capacity();
+    data_ = buffer_->mutable_data();
+    if (capacity_ > old_capacity) {
+      memset(data_ + old_capacity, 0, capacity_ - old_capacity);
+    }
+    return Status::OK();
+  }
+
+  /// \brief Ensure that builder can accommodate the additional number of bytes
+  /// without the need to perform allocations
+  ///
+  /// \param[in] additional_bytes number of additional bytes to make space for
+  /// \param[in] grow_by_factor if true, round up allocations using the
+  /// strategy in BufferBuilder::GrowByFactor
+  /// \return Status
+  Status Reserve(const int64_t additional_bytes, bool grow_by_factor = false) {
+    auto min_capacity = size_ + additional_bytes;
+    if (min_capacity <= capacity_) return Status::OK();
+    if (grow_by_factor) {
+      min_capacity = GrowByFactor(min_capacity);
+    }
+    return Resize(min_capacity, false);
+  }
+
+  /// \brief Return a capacity expanded by a growth factor of 2
+  static int64_t GrowByFactor(const int64_t min_capacity) {
+    // If the capacity was not already a multiple of 2, do so here
+    // TODO(emkornfield) doubling isn't great default allocation practice
+    // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md
+    // for discussion
+    return BitUtil::NextPower2(min_capacity);
+  }
+
+  /// \brief Append the given data to the buffer
+  ///
+  /// The buffer is automatically expanded if necessary.
+  Status Append(const void* data, const int64_t length) {
+    if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
+      ARROW_RETURN_NOT_OK(Resize(GrowByFactor(size_ + length), false));
+    }
+    UnsafeAppend(data, length);
+    return Status::OK();
+  }
+
+  /// \brief Append copies of a value to the buffer
+  ///
+  /// The buffer is automatically expanded if necessary.
+  Status Append(const int64_t num_copies, uint8_t value) {
+    ARROW_RETURN_NOT_OK(Reserve(num_copies, true));
+    UnsafeAppend(num_copies, value);
+    return Status::OK();
+  }
+
+  /// \brief Append the given data to the buffer
+  ///
+  /// The buffer is automatically expanded if necessary.
+  template <size_t NBYTES>
+  Status Append(const std::array<uint8_t, NBYTES>& data) {
+    constexpr auto nbytes = static_cast<int64_t>(NBYTES);
+    ARROW_RETURN_NOT_OK(Reserve(NBYTES, true));
+    std::copy(data.cbegin(), data.cend(), data_ + size_);
+    size_ += nbytes;
+    return Status::OK();
+  }
+
+  // Advance pointer and zero out memory
+  Status Advance(const int64_t length) { return Append(length, 0); }
+
+  // Unsafe methods don't check existing size
+  void UnsafeAppend(const void* data, const int64_t length) {
+    memcpy(data_ + size_, data, static_cast<size_t>(length));
+    size_ += length;
+  }
+
+  void UnsafeAppend(const int64_t num_copies, uint8_t value) {
+    memset(data_ + size_, value, static_cast<size_t>(num_copies));
+    size_ += num_copies;
+  }
+
+  /// \brief Return result of builder as a Buffer object.
+  ///
+  /// The builder is reset and can be reused afterwards.
+  ///
+  /// \param[out] out the finalized Buffer object
+  /// \param shrink_to_fit if the buffer size is smaller than its capacity,
+  /// reallocate to fit more tightly in memory. Set to false to avoid
+  /// a reallocation, at the expense of potentially more memory consumption.
+  /// \return Status
+  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
+    *out = buffer_;
+    Reset();
+    return Status::OK();
+  }
+
+  void Reset() {
+    buffer_ = NULLPTR;
+    capacity_ = size_ = 0;
+  }
+
+  int64_t capacity() const { return capacity_; }
+  int64_t length() const { return size_; }
+  const uint8_t* data() const { return data_; }
+  uint8_t* mutable_data() { return data_; }
+
+ private:
+  std::shared_ptr<ResizableBuffer> buffer_;
+  MemoryPool* pool_;
+  uint8_t* data_;
+  int64_t capacity_;
+  int64_t size_;
+};
+
+template <typename T, typename Enable = void>
+class TypedBufferBuilder;
+
+/// \brief A BufferBuilder for building a buffer of arithmetic elements
+template <typename T>
+class TypedBufferBuilder<T,
+                         typename std::enable_if<std::is_arithmetic<T>::value>::type> {
+ public:
+  explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
+      : bytes_builder_(pool) {}
+
+  Status Append(T value) {
+    return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+  }
+
+  Status Append(const T* values, int64_t num_elements) {
+    return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
+                                 num_elements * sizeof(T));
+  }
+
+  Status Append(const int64_t num_copies, T value) {
+    ARROW_RETURN_NOT_OK(
+        Resize(BufferBuilder::GrowByFactor(num_copies + length()), false));
+    UnsafeAppend(num_copies, value);
+    return Status::OK();
+  }
+
+  void UnsafeAppend(T value) {
+    bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+  }
+
+  void UnsafeAppend(const T* values, int64_t num_elements) {
+    bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
+                                num_elements * sizeof(T));
+  }
+
+  void UnsafeAppend(const int64_t num_copies, T value) {
+    auto data = mutable_data() + length();
+    bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0);
+    for (const auto end = data + num_copies; data != end; ++data) {
+      *data = value;
+    }
+  }
+
+  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+    return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
+  }
+
+  Status Reserve(const int64_t additional_elements) {
+    return bytes_builder_.Reserve(additional_elements * sizeof(T));
+  }
+
+  Status Advance(const int64_t length) {
+    return bytes_builder_.Advance(length * sizeof(T));
+  }
+
+  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    return bytes_builder_.Finish(out, shrink_to_fit);
+  }
+
+  void Reset() { bytes_builder_.Reset(); }
+
+  int64_t length() const { return bytes_builder_.length() / sizeof(T); }
+  int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
+  const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
+  T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
+
+ private:
+  BufferBuilder bytes_builder_;
+};
+
+/// \brief A BufferBuilder for building a buffer containing a bitmap
+template <>
+class TypedBufferBuilder<bool> {
+ public:
+  explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
+      : bytes_builder_(pool) {}
+
+  Status Append(bool value) {
+    ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + 1));
+    UnsafeAppend(value);
+    return Status::OK();
+  }
+
+  Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
+    ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_elements));
+    UnsafeAppend(valid_bytes, num_elements);
+    return Status::OK();
+  }
+
+  Status Append(const int64_t num_copies, bool value) {
+    ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_copies));
+    UnsafeAppend(num_copies, value);
+    return Status::OK();
+  }
+
+  void UnsafeAppend(bool value) {
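+    // Write the bit, then keep false_count_ current so a false/null count is
+    // available later without rescanning the bitmap.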
BitUtil::SetBitTo(mutable_data(), bit_length_, value); + if (!value) { + ++false_count_; + } + ++bit_length_; + } + + void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { + if (num_elements == 0) return; + int64_t i = 0; + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = bytes[i++]; + if (!value) ++false_count_; + return value; + }); + bit_length_ += num_elements; + } + + void UnsafeAppend(const int64_t num_copies, bool value) { + BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); + if (!value) { + false_count_ += num_copies; + } + bit_length_ += num_copies; + } + + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + const int64_t old_byte_capacity = bytes_builder_.capacity(); + const int64_t new_byte_capacity = BitUtil::BytesForBits(new_capacity); + ARROW_RETURN_NOT_OK(bytes_builder_.Resize(new_byte_capacity, shrink_to_fit)); + if (new_byte_capacity > old_byte_capacity) { + memset(mutable_data() + old_byte_capacity, 0, + static_cast(new_byte_capacity - old_byte_capacity)); + } + return Status::OK(); + } + + Status Reserve(const int64_t additional_elements) { + return Resize(bit_length_ + additional_elements, false); + } + + Status Advance(const int64_t length) { + bit_length_ += length; + false_count_ += length; + return ResizeWithGrowthFactor(bit_length_); + } + + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + bit_length_ = false_count_ = 0; + return bytes_builder_.Finish(out, shrink_to_fit); + } + + void Reset() { + bytes_builder_.Reset(); + bit_length_ = false_count_ = 0; + } + + int64_t length() const { return bit_length_; } + int64_t capacity() const { return bytes_builder_.capacity() * 8; } + const uint8_t* data() const { return bytes_builder_.data(); } + uint8_t* mutable_data() { return bytes_builder_.mutable_data(); } + int64_t false_count() const { return false_count_; } + + private: + Status ResizeWithGrowthFactor(const int64_t min_capacity) { + return Resize(BufferBuilder::GrowByFactor(min_capacity), false); + } + BufferBuilder bytes_builder_; + int64_t bit_length_ = 0; + int64_t false_count_ = 0; +}; + +} // namespace arrow + +#endif // ARROW_BUFFER_BUILDER_H diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc index 4d16f7f9c277d..8ff117402f04c 100644 --- a/cpp/src/arrow/buffer-test.cc +++ b/cpp/src/arrow/buffer-test.cc @@ -26,6 +26,7 @@ #include +#include "arrow/buffer-builder.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" @@ -35,6 +36,22 @@ using std::string; namespace arrow { +TEST(TestAllocate, Bitmap) { + std::shared_ptr new_buffer; + EXPECT_OK(AllocateBitmap(default_memory_pool(), 100, &new_buffer)); + EXPECT_GE(new_buffer->size(), 13); + EXPECT_EQ(new_buffer->capacity() % 8, 0); +} + +TEST(TestAllocate, EmptyBitmap) { + std::shared_ptr new_buffer; + EXPECT_OK(AllocateEmptyBitmap(default_memory_pool(), 100, &new_buffer)); + EXPECT_EQ(new_buffer->size(), 13); + EXPECT_EQ(new_buffer->capacity() % 8, 0); + EXPECT_TRUE(std::all_of(new_buffer->data(), new_buffer->data() + new_buffer->capacity(), + [](int8_t byte) { return byte == 0; })); +} + TEST(TestBuffer, FromStdString) { std::string val = "hello, world"; @@ -176,6 +193,65 @@ TEST(TestBuffer, SliceMutableBuffer) { ASSERT_TRUE(slice->Equals(expected)); } +template +void TestZeroSizeAllocateBuffer(MemoryPool* pool, AllocateFunction&& allocate_func) { + auto allocated_bytes = pool->bytes_allocated(); + { + std::shared_ptr buffer; + + 
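+    // A zero-byte allocation must still succeed, and pool accounting must
+    // balance once the buffer goes out of scope.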
ASSERT_OK(allocate_func(pool, 0, &buffer)); + ASSERT_EQ(buffer->size(), 0); + // Even 0-sized buffers should not have a null data pointer + ASSERT_NE(buffer->data(), nullptr); + ASSERT_EQ(buffer->mutable_data(), buffer->data()); + + ASSERT_GE(pool->bytes_allocated(), allocated_bytes); + } + ASSERT_EQ(pool->bytes_allocated(), allocated_bytes); +} + +TEST(TestAllocateBuffer, ZeroSize) { + MemoryPool* pool = default_memory_pool(); + auto allocate_func = [](MemoryPool* pool, int64_t size, std::shared_ptr* out) { + return AllocateBuffer(pool, size, out); + }; + TestZeroSizeAllocateBuffer(pool, allocate_func); +} + +TEST(TestAllocateResizableBuffer, ZeroSize) { + MemoryPool* pool = default_memory_pool(); + auto allocate_func = [](MemoryPool* pool, int64_t size, std::shared_ptr* out) { + std::shared_ptr res; + RETURN_NOT_OK(AllocateResizableBuffer(pool, size, &res)); + *out = res; + return Status::OK(); + }; + TestZeroSizeAllocateBuffer(pool, allocate_func); +} + +TEST(TestAllocateResizableBuffer, ZeroResize) { + MemoryPool* pool = default_memory_pool(); + auto allocated_bytes = pool->bytes_allocated(); + { + std::shared_ptr buffer; + + ASSERT_OK(AllocateResizableBuffer(pool, 1000, &buffer)); + ASSERT_EQ(buffer->size(), 1000); + ASSERT_NE(buffer->data(), nullptr); + ASSERT_EQ(buffer->mutable_data(), buffer->data()); + + ASSERT_GE(pool->bytes_allocated(), allocated_bytes + 1000); + + ASSERT_OK(buffer->Resize(0)); + ASSERT_NE(buffer->data(), nullptr); + ASSERT_EQ(buffer->mutable_data(), buffer->data()); + + ASSERT_GE(pool->bytes_allocated(), allocated_bytes); + ASSERT_LT(pool->bytes_allocated(), allocated_bytes + 1000); + } + ASSERT_EQ(pool->bytes_allocated(), allocated_bytes); +} + TEST(TestBufferBuilder, ResizeReserve) { const std::string data = "some data"; auto data_ptr = data.c_str(); @@ -201,6 +277,82 @@ TEST(TestBufferBuilder, ResizeReserve) { ASSERT_EQ(128, builder.capacity()); } +template +class TypedTestBufferBuilder : public ::testing::Test {}; + +using BufferBuilderElements = ::testing::Types; + +TYPED_TEST_CASE(TypedTestBufferBuilder, BufferBuilderElements); + +TYPED_TEST(TypedTestBufferBuilder, BasicTypedBufferBuilderUsage) { + TypedBufferBuilder builder; + + ASSERT_OK(builder.Append(static_cast(0))); + ASSERT_EQ(builder.length(), 1); + ASSERT_EQ(builder.capacity(), 64 / sizeof(TypeParam)); + + constexpr int nvalues = 4; + TypeParam values[nvalues]; + for (int i = 0; i != nvalues; ++i) { + values[i] = static_cast(i); + } + ASSERT_OK(builder.Append(values, nvalues)); + ASSERT_EQ(builder.length(), nvalues + 1); + + std::shared_ptr built; + ASSERT_OK(builder.Finish(&built)); + + auto data = reinterpret_cast(built->data()); + ASSERT_EQ(data[0], static_cast(0)); + for (auto value : values) { + ++data; + ASSERT_EQ(*data, value); + } +} + +TEST(TestBufferBuilder, BasicBoolBufferBuilderUsage) { + TypedBufferBuilder builder; + + ASSERT_OK(builder.Append(false)); + ASSERT_EQ(builder.length(), 1); + ASSERT_EQ(builder.capacity(), 64 * 8); + + constexpr int nvalues = 4; + uint8_t values[nvalues]; + for (int i = 0; i != nvalues; ++i) { + values[i] = static_cast(i); + } + ASSERT_OK(builder.Append(values, nvalues)); + ASSERT_EQ(builder.length(), nvalues + 1); + + ASSERT_EQ(builder.false_count(), 2); + + std::shared_ptr built; + ASSERT_OK(builder.Finish(&built)); + + ASSERT_EQ(BitUtil::GetBit(built->data(), 0), false); + for (int i = 0; i != nvalues; ++i) { + ASSERT_EQ(BitUtil::GetBit(built->data(), i + 1), static_cast(values[i])); + } +} + +TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) { + 
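+  // Run-length appends should pack bits contiguously: 13 set bits followed
+  // by 17 cleared bits, with false_count() tracking the zeros.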
TypedBufferBuilder builder; + + ASSERT_OK(builder.Append(13, true)); + ASSERT_OK(builder.Append(17, false)); + ASSERT_EQ(builder.length(), 13 + 17); + ASSERT_EQ(builder.capacity(), 64 * 8); + ASSERT_EQ(builder.false_count(), 17); + + std::shared_ptr built; + ASSERT_OK(builder.Finish(&built)); + + for (int i = 0; i != 13 + 17; ++i) { + EXPECT_EQ(BitUtil::GetBit(built->data(), i), i < 13) << "index = " << i; + } +} + template class TypedTestBuffer : public ::testing::Test {}; diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 01bb0c34968d3..8f05912b80417 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -126,25 +126,18 @@ class PoolBuffer : public ResizableBuffer { } Status Resize(const int64_t new_size, bool shrink_to_fit = true) override { - if (!shrink_to_fit || (new_size > size_)) { - RETURN_NOT_OK(Reserve(new_size)); - } else { - // Buffer is not growing, so shrink to the requested size without + if (mutable_data_ && shrink_to_fit && new_size <= size_) { + // Buffer is non-null and is not growing, so shrink to the requested size without // excess space. int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size); if (capacity_ != new_capacity) { // Buffer hasn't got yet the requested size. - if (new_size == 0) { - pool_->Free(mutable_data_, capacity_); - capacity_ = 0; - mutable_data_ = nullptr; - data_ = nullptr; - } else { - RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &mutable_data_)); - data_ = mutable_data_; - capacity_ = new_capacity; - } + RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &mutable_data_)); + data_ = mutable_data_; + capacity_ = new_capacity; } + } else { + RETURN_NOT_OK(Reserve(new_size)); } size_ = new_size; @@ -219,9 +212,13 @@ Status AllocateResizableBuffer(const int64_t size, return AllocateResizableBuffer(default_memory_pool(), size, out); } +Status AllocateBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { + return AllocateBuffer(pool, BitUtil::BytesForBits(length), out); +} + Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { - RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), out)); + RETURN_NOT_OK(AllocateBitmap(pool, length, out)); memset((*out)->mutable_data(), 0, static_cast((*out)->size())); return Status::OK(); } diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 66c131413c2d3..306e677619fd7 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -19,7 +19,6 @@ #define ARROW_BUFFER_H #include -#include #include #include #include @@ -29,7 +28,6 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -40,13 +38,15 @@ namespace arrow { /// \class Buffer /// \brief Object containing a pointer to a piece of contiguous memory with a -/// particular size. Base class does not own its memory +/// particular size. /// /// Buffers have two related notions of length: size and capacity. Size is /// the number of bytes that might have valid data. Capacity is the number -/// of bytes that where allocated for the buffer in total. +/// of bytes that were allocated for the buffer in total. /// -/// The following invariant is always true: Size < Capacity +/// The Buffer base class does not own its memory, but subclasses often do. 
+/// +/// The following invariant is always true: Size <= Capacity class ARROW_EXPORT Buffer { public: /// \brief Construct from buffer and size without copying memory @@ -108,7 +108,10 @@ class ARROW_EXPORT Buffer { #ifndef NDEBUG CheckMutable(); #endif - memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); + // A zero-capacity buffer can have a null data pointer + if (capacity_ != 0) { + memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); + } } /// \brief Construct a new buffer that owns its memory from a std::string @@ -158,9 +161,12 @@ class ARROW_EXPORT Buffer { /// \note Can throw std::bad_alloc if buffer is large std::string ToString() const; - int64_t capacity() const { return capacity_; } + /// \brief Return a pointer to the buffer's data const uint8_t* data() const { return data_; } - + /// \brief Return a writable pointer to the buffer's data + /// + /// The buffer has to be mutable. Otherwise, an assertion may be thrown + /// or a null pointer may be returned. uint8_t* mutable_data() { #ifndef NDEBUG CheckMutable(); @@ -168,8 +174,12 @@ class ARROW_EXPORT Buffer { return mutable_data_; } + /// \brief Return the buffer's size in bytes int64_t size() const { return size_; } + /// \brief Return the buffer's capacity (number of allocated bytes) + int64_t capacity() const { return capacity_; } + std::shared_ptr parent() const { return parent_; } protected: @@ -188,26 +198,38 @@ class ARROW_EXPORT Buffer { ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; -/// Construct a view on passed buffer at the indicated offset and length. This -/// function cannot fail and does not error checking (except in debug builds) +/// \defgroup buffer-slicing-functions Functions for slicing buffers +/// +/// @{ + +/// \brief Construct a view on a buffer at the given offset and length. +/// +/// This function cannot fail and does not check for errors (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, const int64_t offset, const int64_t length) { return std::make_shared(buffer, offset, length); } +/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. +/// +/// This function cannot fail and does not check for errors (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, const int64_t offset) { int64_t length = buffer->size() - offset; return SliceBuffer(buffer, offset, length); } -/// Construct a mutable buffer slice. If the parent buffer is not mutable, this -/// will abort in debug builds +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). ARROW_EXPORT std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, const int64_t offset, const int64_t length); +/// @} + /// \class MutableBuffer /// \brief A Buffer whose contents can be mutated. May or may not own its data. class ARROW_EXPORT MutableBuffer : public Buffer { @@ -241,10 +263,11 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { /// Change buffer reported size to indicated size, allocating memory if /// necessary. This will ensure that the capacity of the buffer is a multiple /// of 64 bytes as defined in Layout.md. - /// Consider using ZeroPadding afterwards, in case you return buffer to a reader. + /// Consider using ZeroPadding afterwards, to conform to the Arrow layout + /// specification. 
/// - /// @param shrink_to_fit On deactivating this option, the capacity of the Buffer won't - /// decrease. + /// @param new_size The new size for the buffer. + /// @param shrink_to_fit Whether to shrink the capacity if new size < current size virtual Status Resize(const int64_t new_size, bool shrink_to_fit = true) = 0; /// Ensure that buffer has enough memory allocated to fit the indicated @@ -266,6 +289,10 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} }; +/// \defgroup buffer-allocation-functions Functions for allocating buffers +/// +/// @{ + /// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. /// /// \param[in] pool a memory pool @@ -344,7 +371,8 @@ Status AllocateResizableBuffer(const int64_t size, std::shared_ptr* out); -/// \brief Allocate a zero-initialized bitmap buffer from a memory pool +/// \brief Allocate a bitmap buffer from a memory pool +/// no guarantee on values is provided. /// /// \param[in] pool memory pool to allocate memory from /// \param[in] length size in bits of bitmap to allocate @@ -352,6 +380,16 @@ Status AllocateResizableBuffer(const int64_t size, std::unique_ptr* out); + +/// \brief Allocate a zero-initialized bitmap buffer from a memory pool +/// +/// \param[in] pool memory pool to allocate memory from +/// \param[in] length size in bits of bitmap to allocate +/// \param[out] out the resulting buffer (zero-initialized). +/// +/// \return Status message +ARROW_EXPORT Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out); @@ -364,150 +402,7 @@ Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, ARROW_EXPORT Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); -// ---------------------------------------------------------------------- -// Buffer builder classes - -/// \class BufferBuilder -/// \brief A class for incrementally building a contiguous chunk of in-memory data -class ARROW_EXPORT BufferBuilder { - public: - explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} - - /// \brief Resizes the buffer to the nearest multiple of 64 bytes - /// - /// \param elements the new capacity of the of the builder. Will be rounded - /// up to a multiple of 64 bytes for padding - /// \param shrink_to_fit if new capacity smaller than existing size, - /// reallocate internal buffer. 
Set to false to avoid reallocations when - /// shrinking the builder - /// \return Status - Status Resize(const int64_t elements, bool shrink_to_fit = true) { - // Resize(0) is a no-op - if (elements == 0) { - return Status::OK(); - } - int64_t old_capacity = capacity_; - - if (buffer_ == NULLPTR) { - ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, elements, &buffer_)); - } else { - ARROW_RETURN_NOT_OK(buffer_->Resize(elements, shrink_to_fit)); - } - capacity_ = buffer_->capacity(); - data_ = buffer_->mutable_data(); - if (capacity_ > old_capacity) { - memset(data_ + old_capacity, 0, capacity_ - old_capacity); - } - return Status::OK(); - } - - /// \brief Ensure that builder can accommodate the additional number of bytes - /// without the need to perform allocations - /// - /// \param size number of additional bytes to make space for - /// \return Status - Status Reserve(const int64_t size) { return Resize(size_ + size, false); } - - Status Append(const void* data, int64_t length) { - if (capacity_ < length + size_) { - int64_t new_capacity = BitUtil::NextPower2(length + size_); - ARROW_RETURN_NOT_OK(Resize(new_capacity)); - } - UnsafeAppend(data, length); - return Status::OK(); - } - - template - Status Append(const std::array& data) { - constexpr auto nbytes = static_cast(NBYTES); - if (capacity_ < nbytes + size_) { - int64_t new_capacity = BitUtil::NextPower2(nbytes + size_); - ARROW_RETURN_NOT_OK(Resize(new_capacity)); - } - - std::copy(data.cbegin(), data.cend(), data_ + size_); - size_ += nbytes; - return Status::OK(); - } - - // Advance pointer and zero out memory - Status Advance(const int64_t length) { - if (capacity_ < length + size_) { - int64_t new_capacity = BitUtil::NextPower2(length + size_); - ARROW_RETURN_NOT_OK(Resize(new_capacity)); - } - memset(data_ + size_, 0, static_cast(length)); - size_ += length; - return Status::OK(); - } - - // Unsafe methods don't check existing size - void UnsafeAppend(const void* data, int64_t length) { - memcpy(data_ + size_, data, static_cast(length)); - size_ += length; - } - - Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { - ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); - *out = buffer_; - Reset(); - return Status::OK(); - } - - void Reset() { - buffer_ = NULLPTR; - capacity_ = size_ = 0; - } - - int64_t capacity() const { return capacity_; } - int64_t length() const { return size_; } - const uint8_t* data() const { return data_; } - - protected: - std::shared_ptr buffer_; - MemoryPool* pool_; - uint8_t* data_; - int64_t capacity_; - int64_t size_; -}; - -template -class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { - public: - explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {} - - Status Append(T arithmetic_value) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - return BufferBuilder::Append(reinterpret_cast(&arithmetic_value), - sizeof(T)); - } - - Status Append(const T* arithmetic_values, int64_t num_elements) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - return BufferBuilder::Append(reinterpret_cast(arithmetic_values), - num_elements * sizeof(T)); - } - - void UnsafeAppend(T arithmetic_value) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - BufferBuilder::UnsafeAppend(reinterpret_cast(&arithmetic_value), sizeof(T)); - } - - void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) { - 
static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - BufferBuilder::UnsafeAppend(reinterpret_cast(arithmetic_values), - num_elements * sizeof(T)); - } - - const T* data() const { return reinterpret_cast(data_); } - int64_t length() const { return size_ / sizeof(T); } - int64_t capacity() const { return capacity_ / sizeof(T); } -}; +/// @} } // namespace arrow diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index f96728dcd4fdf..e4a56bf103ef7 100644 --- a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -148,7 +149,7 @@ static void BM_BuildBooleanArrayNoNulls( constexpr uint8_t bit_pattern = 0xcc; // 0b11001100 uint64_t index = 0; std::generate(data.begin(), data.end(), - [&index]() -> uint8_t { return (bit_pattern >> ((index++) % 8)) & 1; }); + [&]() -> uint8_t { return (bit_pattern >> ((index++) % 8)) & 1; }); while (state.KeepRunning()) { BooleanBuilder builder; @@ -163,10 +164,11 @@ static void BM_BuildBooleanArrayNoNulls( } static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1 << 20; - + // About 160MB + const int64_t iterations = 1 << 24; std::string value = "1234567890"; - while (state.KeepRunning()) { + + for (auto _ : state) { BinaryBuilder builder; for (int64_t i = 0; i < iterations; i++) { ABORT_NOT_OK(builder.Append(value)); @@ -177,6 +179,26 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const state.SetBytesProcessed(state.iterations() * iterations * value.size()); } +static void BM_BuildChunkedBinaryArray( + benchmark::State& state) { // NOLINT non-const reference + // About 160MB + const int64_t iterations = 1 << 24; + std::string value = "1234567890"; + + for (auto _ : state) { + // 1MB chunks + const int32_t chunksize = 1 << 20; + internal::ChunkedBinaryBuilder builder(chunksize); + for (int64_t i = 0; i < iterations; i++) { + ABORT_NOT_OK(builder.Append(reinterpret_cast(value.data()), + static_cast(value.size()))); + } + ArrayVector out; + ABORT_NOT_OK(builder.Finish(&out)); + } + state.SetBytesProcessed(state.iterations() * iterations * value.size()); +} + static void BM_BuildFixedSizeBinaryArray( benchmark::State& state) { // NOLINT non-const reference const int64_t iterations = 1 << 20; @@ -271,13 +293,13 @@ static std::vector MakeStringDictFodder(int32_t n_values, *it++ = "abcfgh"; // Add random strings std::uniform_int_distribution length_dist(2, 20); - std::independent_bits_engine bytes_gen(42); + std::independent_bits_engine bytes_gen(42); - std::generate(it, values_dict.end(), [&]() { + std::generate(it, values_dict.end(), [&] { auto length = length_dist(gen); std::string s(length, 'X'); for (int32_t i = 0; i < length; ++i) { - s[i] = bytes_gen(); + s[i] = static_cast(bytes_gen()); } return s; }); @@ -285,7 +307,7 @@ static std::vector MakeStringDictFodder(int32_t n_values, { std::uniform_int_distribution indices_dist(0, n_distinct - 1); std::generate(values.begin(), values.end(), - [&]() { return values_dict[indices_dist(gen)]; }); + [&] { return values_dict[indices_dist(gen)]; }); } return values; } @@ -328,7 +350,7 @@ static void BM_BuildStringDictionaryArray( const auto fodder = MakeStringDictFodder(10000, 100); auto type = binary(); auto fodder_size = - std::accumulate(fodder.begin(), fodder.end(), 0, + std::accumulate(fodder.begin(), fodder.end(), static_cast(0), 
[&](size_t acc, const std::string& s) { return acc + s.size(); }); while (state.KeepRunning()) { @@ -371,10 +393,9 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildBinaryArray)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildFixedSizeBinaryArray) - ->Repetitions(kRepetitions) - ->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildChunkedBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildFixedSizeBinaryArray)->MinTime(3.0)->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildInt64DictionaryArrayRandom) ->Repetitions(kRepetitions) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index aef4df05108b7..2072edc936a3c 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -15,513 +15,20 @@ // specific language governing permissions and limitations // under the License. -#include -#include -#include -#include +#include "arrow/builder.h" + #include +#include #include #include -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int-util.h" -#include "arrow/util/logging.h" namespace arrow { -using internal::checked_cast; - -Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { - if (buffer) { - if (bytes_filled < buffer->size()) { - // Trim buffer - RETURN_NOT_OK(buffer->Resize(bytes_filled)); - } - // zero the padding - buffer->ZeroPadding(); - } else { - // Null buffers are allowed in place of 0-byte buffers - DCHECK_EQ(bytes_filled, 0); - } - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(bool is_valid) { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - // TODO(emkornfield) doubling isn't great default allocation practice - // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md - // fo discussion - RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); - } - UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - RETURN_NOT_OK(Reserve(length)); - - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status ArrayBuilder::Resize(int64_t capacity) { - // Target size of validity (null) bitmap data - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Padding is zeroed by AllocateResizableBuffer - memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = null_bitmap_->capacity(); - RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); - - const int64_t new_bitmap_capacity = null_bitmap_->capacity(); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Zero the region between the original capacity and the new capacity, - // including padding, which has not been zeroed, unlike - // AllocateResizableBuffer - if (old_bitmap_capacity < new_bitmap_capacity) { - memset(null_bitmap_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - 
old_bitmap_capacity)); - } - } - capacity_ = capacity; - return Status::OK(); -} - -Status ArrayBuilder::Advance(int64_t elements) { - if (length_ + elements > capacity_) { - return Status::Invalid("Builder must be expanded"); - } - length_ += elements; - return Status::OK(); -} - -Status ArrayBuilder::Finish(std::shared_ptr* out) { - std::shared_ptr internal_data; - RETURN_NOT_OK(FinishInternal(&internal_data)); - *out = MakeArray(internal_data); - return Status::OK(); -} - -Status ArrayBuilder::Reserve(int64_t additional_elements) { - if (length_ + additional_elements > capacity_) { - // TODO(emkornfield) power of 2 growth is potentially suboptimal - int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); - return Resize(new_size); - } - return Status::OK(); -} - -void ArrayBuilder::Reset() { - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; -} - -Status ArrayBuilder::SetNotNull(int64_t length) { - RETURN_NOT_OK(Reserve(length)); - UnsafeSetNotNull(length); - return Status::OK(); -} - -void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - if (valid_bytes == nullptr) { - UnsafeSetNotNull(length); - return; - } - UnsafeAppendToBitmap(valid_bytes, valid_bytes + length); -} - -void ArrayBuilder::UnsafeAppendToBitmap(const std::vector& is_valid) { - UnsafeAppendToBitmap(is_valid.begin(), is_valid.end()); -} - -void ArrayBuilder::UnsafeSetNotNull(int64_t length) { - const int64_t new_length = length + length_; - - // Fill up the bytes until we have a byte alignment - int64_t pad_to_byte = std::min(8 - (length_ % 8), length); - - if (pad_to_byte == 8) { - pad_to_byte = 0; - } - for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { - BitUtil::SetBit(null_bitmap_data_, i); - } - - // Fast bitsetting - int64_t fast_length = (length - pad_to_byte) / 8; - memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, - static_cast(fast_length)); - - // Trailing bits - for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { - BitUtil::SetBit(null_bitmap_data_, i); - } - - length_ = new_length; -} - -// ---------------------------------------------------------------------- -// Null builder - -Status NullBuilder::FinishInternal(std::shared_ptr* out) { - *out = ArrayData::Make(null(), length_, {nullptr}, length_); - length_ = null_count_ = 0; - return Status::OK(); -} - -// ---------------------------------------------------------------------- - -template -void PrimitiveBuilder::Reset() { - data_.reset(); - raw_data_ = nullptr; -} - -template -Status PrimitiveBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - int64_t nbytes = TypeTraits::bytes_required(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); - } else { - RETURN_NOT_OK(data_->Resize(nbytes)); - } - - raw_data_ = reinterpret_cast(data_->mutable_data()); - return ArrayBuilder::Resize(capacity); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const std::vector& 
is_valid) {
-  RETURN_NOT_OK(Reserve(length));
-  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
-  if (length > 0) {
-    std::memcpy(raw_data_ + length_, values,
-                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
-  }
-
-  // length_ is updated by these
-  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
-  return Status::OK();
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values,
-                                         const std::vector<bool>& is_valid) {
-  return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values) {
-  return AppendValues(values.data(), static_cast<int64_t>(values.size()));
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_), data_.get()));
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_);
-
-  data_ = null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-
-  return Status::OK();
-}
-
-template class PrimitiveBuilder<UInt8Type>;
-template class PrimitiveBuilder<UInt16Type>;
-template class PrimitiveBuilder<UInt32Type>;
-template class PrimitiveBuilder<UInt64Type>;
-template class PrimitiveBuilder<Int8Type>;
-template class PrimitiveBuilder<Int16Type>;
-template class PrimitiveBuilder<Int32Type>;
-template class PrimitiveBuilder<Int64Type>;
-template class PrimitiveBuilder<Date32Type>;
-template class PrimitiveBuilder<Date64Type>;
-template class PrimitiveBuilder<Time32Type>;
-template class PrimitiveBuilder<Time64Type>;
-template class PrimitiveBuilder<TimestampType>;
-template class PrimitiveBuilder<HalfFloatType>;
-template class PrimitiveBuilder<FloatType>;
-template class PrimitiveBuilder<DoubleType>;
-
-BooleanBuilder::BooleanBuilder(MemoryPool* pool)
-    : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {}
-
-BooleanBuilder::BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
-    : BooleanBuilder(pool) {
-  DCHECK_EQ(Type::BOOL, type->id());
-}
-
-void BooleanBuilder::Reset() {
-  ArrayBuilder::Reset();
-  data_.reset();
-  raw_data_ = nullptr;
-}
-
-Status BooleanBuilder::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
-
-  const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity);
-  if (capacity_ == 0) {
-    RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_));
-    raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-
-    // We zero the memory for booleans to keep things simple; for some reason if
-    // we do not, even though we may write every bit (through in-place | or &),
-    // valgrind will still show a warning. If we do not zero the bytes here, we
-    // will have to be careful to zero them in AppendNull and AppendNulls. Also,
-    // zeroing the bits results in deterministic bits when each byte may have a
-    // mix of nulls and not nulls.
-    //
-    // We only zero up to new_bitmap_size because the padding was zeroed by
-    // AllocateResizableBuffer
-    memset(raw_data_, 0, static_cast<size_t>(new_bitmap_size));
-  } else {
-    const int64_t old_bitmap_capacity = data_->capacity();
-    RETURN_NOT_OK(data_->Resize(new_bitmap_size));
-    const int64_t new_bitmap_capacity = data_->capacity();
-    raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-
-    // See comment above about why we zero memory for booleans
-    memset(raw_data_ + old_bitmap_capacity, 0,
-           static_cast<size_t>(new_bitmap_capacity - old_bitmap_capacity));
-  }
-
-  return ArrayBuilder::Resize(capacity);
-}
-
-Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  int64_t bit_offset = length_ % 8;
-  if (bit_offset > 0) {
-    // Adjust last byte
-    data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset];
-  }
-
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get()));
-
-  *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_);
-
-  data_ = null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-  return Status::OK();
-}
-
-Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
-                                    const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(Reserve(length));
-
-  int64_t i = 0;
-  internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                 [values, &i]() -> bool { return values[i++] != 0; });
-
-  // this updates length_
-  ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
-  return Status::OK();
-}
-
-Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
-                                    const std::vector<bool>& is_valid) {
-  RETURN_NOT_OK(Reserve(length));
-  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
-  int64_t i = 0;
-  internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                 [values, &i]() -> bool { return values[i++]; });
-
-  // this updates length_
-  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
-  return Status::OK();
-}
-
-Status BooleanBuilder::AppendValues(const std::vector<uint8_t>& values,
-                                    const std::vector<bool>& is_valid) {
-  return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
-}
-
-Status BooleanBuilder::AppendValues(const std::vector<uint8_t>& values) {
-  return AppendValues(values.data(), static_cast<int64_t>(values.size()));
-}
-
-Status BooleanBuilder::AppendValues(const std::vector<bool>& values,
-                                    const std::vector<bool>& is_valid) {
-  const int64_t length = static_cast<int64_t>(values.size());
-  RETURN_NOT_OK(Reserve(length));
-  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
-  int64_t i = 0;
-  internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                 [&values, &i]() -> bool { return values[i++]; });
-
-  // this updates length_
-  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
-  return Status::OK();
-}
-
-Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
-  const int64_t length = static_cast<int64_t>(values.size());
-  RETURN_NOT_OK(Reserve(length));
-
-  int64_t i = 0;
-  internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                 [&values, &i]() -> bool { return values[i++]; });
-
-  // this updates length_
-  ArrayBuilder::UnsafeSetNotNull(length);
-  return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// ListBuilder
-
-ListBuilder::ListBuilder(MemoryPool* pool,
-                         std::shared_ptr<ArrayBuilder> const& value_builder,
-                         const std::shared_ptr<DataType>& type)
-    : ArrayBuilder(type ? type
-                        : std::static_pointer_cast<DataType>(
-                              std::make_shared<ListType>(value_builder->type())),
-                   pool),
-      offsets_builder_(pool),
-      value_builder_(value_builder) {}
-
-Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length,
-                                 const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(Reserve(length));
-  UnsafeAppendToBitmap(valid_bytes, length);
-  offsets_builder_.UnsafeAppend(offsets, length);
-  return Status::OK();
-}
-
-Status ListBuilder::AppendNextOffset() {
-  int64_t num_values = value_builder_->length();
-  if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) {
-    std::stringstream ss;
-    ss << "ListArray cannot contain more than INT32_MAX - 1 child elements,"
-       << " have " << num_values;
-    return Status::CapacityError(ss.str());
-  }
-  return offsets_builder_.Append(static_cast<int32_t>(num_values));
-}
-
-Status ListBuilder::Append(bool is_valid) {
-  RETURN_NOT_OK(Reserve(1));
-  UnsafeAppendToBitmap(is_valid);
-  return AppendNextOffset();
-}
-
-Status ListBuilder::Resize(int64_t capacity) {
-  DCHECK_LE(capacity, kListMaximumElements);
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-
-  // one more than requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
-  return ArrayBuilder::Resize(capacity);
-}
-
-Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(AppendNextOffset());
-
-  // Offset padding zeroed by BufferBuilder
-  std::shared_ptr<Buffer> offsets;
-  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
-
-  std::shared_ptr<ArrayData> items;
-  if (values_) {
-    items = values_->data();
-  } else {
-    if (value_builder_->length() == 0) {
-      // Try to make sure we get a non-null values buffer (ARROW-2744)
-      RETURN_NOT_OK(value_builder_->Resize(0));
-    }
-    RETURN_NOT_OK(value_builder_->FinishInternal(&items));
-  }
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_);
-  (*out)->child_data.emplace_back(std::move(items));
-  Reset();
-  return Status::OK();
-}
-
-void ListBuilder::Reset() {
-  ArrayBuilder::Reset();
-  values_.reset();
-  offsets_builder_.Reset();
-  value_builder_->Reset();
-}
-
-ArrayBuilder* ListBuilder::value_builder() const {
-  DCHECK(!values_) << "Using value builder is pointless when values_ is set";
-  return value_builder_.get();
-}
-
-// ----------------------------------------------------------------------
-// Struct
-
-StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
-                             std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders)
-    : ArrayBuilder(type, pool), field_builders_(std::move(field_builders)) {}
-
-void StructBuilder::Reset() {
-  ArrayBuilder::Reset();
-  for (const auto& field_builder : field_builders_) {
-    field_builder->Reset();
-  }
-}
-
-Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
-
-  (*out)->child_data.resize(field_builders_.size());
-  for (size_t i = 0; i < field_builders_.size(); ++i) {
-    if (length_ == 0) {
-      // Try to make sure the child buffers are initialized
-      RETURN_NOT_OK(field_builders_[i]->Resize(0));
-    }
-    RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i]));
-  }
-
-  null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-  return Status::OK();
-}
+class MemoryPool;
 
 // ----------------------------------------------------------------------
 // Helper functions
@@ -566,7 +73,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
     case Type::LIST: {
      std::unique_ptr<ArrayBuilder> value_builder;
      std::shared_ptr<DataType> value_type =
-          checked_cast<const ListType&>(*type).value_type();
+          internal::checked_cast<const ListType&>(*type).value_type();
      RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder));
      out->reset(new ListBuilder(pool, std::move(value_builder)));
      return Status::OK();
@@ -586,9 +93,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
     }
     default: {
-      std::stringstream ss;
-      ss << "MakeBuilder: cannot construct builder for type " << type->ToString();
-      return Status::NotImplemented(ss.str());
+      return Status::NotImplemented("MakeBuilder: cannot construct builder for type ",
+                                    type->ToString());
     }
   }
 }
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 34398eebebfb6..a7ab22c1beedb 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -15,1189 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_BUILDER_H
-#define ARROW_BUILDER_H
+#pragma once
 
-#include <algorithm>  // IWYU pragma: keep
-#include <array>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <limits>
 #include <memory>
-#include <string>
-#include <type_traits>
-#include <vector>
 
-#include "arrow/buffer.h"
-#include "arrow/memory_pool.h"
+#include "arrow/array/builder_adaptive.h"   // IWYU pragma: export
+#include "arrow/array/builder_base.h"       // IWYU pragma: export
+#include "arrow/array/builder_binary.h"     // IWYU pragma: export
+#include "arrow/array/builder_decimal.h"    // IWYU pragma: export
+#include "arrow/array/builder_dict.h"       // IWYU pragma: export
+#include "arrow/array/builder_nested.h"     // IWYU pragma: export
+#include "arrow/array/builder_primitive.h"  // IWYU pragma: export
 #include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit-util.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/string_view.h"
-#include "arrow/util/type_traits.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
 
-class Array;
-struct ArrayData;
-class Decimal128;
-
-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
-constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
-
-constexpr int64_t kMinBuilderCapacity = 1 << 5;
-
-/// Base class for all data array builders.
-///
-/// This class provides facilities for incrementally building the null bitmap
-/// (see Append methods) and as a side effect the current number of slots and
-/// the null count.
-///
-/// \note Users are expected to use builders as one of the concrete types below.
-/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
-class ARROW_EXPORT ArrayBuilder {
- public:
-  explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
-      : type_(type),
-        pool_(pool),
-        null_bitmap_(NULLPTR),
-        null_count_(0),
-        null_bitmap_data_(NULLPTR),
-        length_(0),
-        capacity_(0) {}
-
-  virtual ~ArrayBuilder() = default;
-
-  /// For nested types. Since the objects are owned by this class instance, we
-  /// skip shared pointers and just return a raw pointer
-  ArrayBuilder* child(int i) { return children_[i].get(); }
-
-  int num_children() const { return static_cast<int>(children_.size()); }
-
-  int64_t length() const { return length_; }
-  int64_t null_count() const { return null_count_; }
-  int64_t capacity() const { return capacity_; }
-
-  /// \brief Ensure that enough memory has been allocated to fit the indicated
-  /// number of total elements in the builder, including any that have already
-  /// been appended. Does not account for reallocations that may be due to
-  /// variable size data, like binary values. To make space for incremental
-  /// appends, use Reserve instead.
-  ///
-  /// \param[in] capacity the minimum number of total array values to
-  /// accommodate. Must be greater than the current capacity.
-  /// \return Status
-  virtual Status Resize(int64_t capacity);
-
-  /// \brief Ensure that there is enough space allocated to add the indicated
-  /// number of elements without any further calls to Resize. The memory
-  /// allocated is rounded up to the next highest power of 2 similar to memory
-  /// allocations in STL containers like std::vector
-  /// \param[in] additional_capacity the number of additional array values
-  /// \return Status
-  Status Reserve(int64_t additional_capacity);
-
-  /// Reset the builder.
-  virtual void Reset();
-
-  /// For cases where raw data was memcpy'd into the internal buffers, allows us
-  /// to advance the length of the builder. It is your responsibility to use
-  /// this function responsibly.
-  Status Advance(int64_t elements);
-
-  /// \brief Return result of builder as an internal generic ArrayData
-  /// object. Resets builder except for dictionary builder
-  ///
-  /// \param[out] out the finalized ArrayData object
-  /// \return Status
-  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
-
-  /// \brief Return result of builder as an Array object.
-  /// Resets the builder except for DictionaryBuilder
-  ///
-  /// \param[out] out the finalized Array object
-  /// \return Status
-  Status Finish(std::shared_ptr<Array>* out);
-
-  std::shared_ptr<DataType> type() const { return type_; }
-
- protected:
-  ArrayBuilder() {}
-
-  /// Append to null bitmap
-  Status AppendToBitmap(bool is_valid);
-
-  /// Vector append. Treat each zero byte as a null. If valid_bytes is null
-  /// assume all of length bits are valid.
-  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
-
-  /// Set the next length bits to not null (i.e. valid).
-  Status SetNotNull(int64_t length);
-
-  // Unsafe operations (don't check capacity/don't resize)
-
-  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
-
-  // Append to null bitmap, update the length
-  void UnsafeAppendToBitmap(bool is_valid) {
-    if (is_valid) {
-      BitUtil::SetBit(null_bitmap_data_, length_);
-    } else {
-      ++null_count_;
-    }
-    ++length_;
-  }
-
-  template <typename IterType>
-  void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) {
-    int64_t byte_offset = length_ / 8;
-    int64_t bit_offset = length_ % 8;
-    uint8_t bitset = null_bitmap_data_[byte_offset];
-
-    for (auto iter = begin; iter != end; ++iter) {
-      if (bit_offset == 8) {
-        bit_offset = 0;
-        null_bitmap_data_[byte_offset] = bitset;
-        byte_offset++;
-        // TODO: Except for the last byte, this shouldn't be needed
-        bitset = null_bitmap_data_[byte_offset];
-      }
-
-      if (*iter) {
-        bitset |= BitUtil::kBitmask[bit_offset];
-      } else {
-        bitset &= BitUtil::kFlippedBitmask[bit_offset];
-        ++null_count_;
-      }
-
-      bit_offset++;
-    }
-
-    if (bit_offset != 0) {
-      null_bitmap_data_[byte_offset] = bitset;
-    }
-
-    length_ += std::distance(begin, end);
-  }
-
-  // Vector append. Treat each zero byte as a null. If valid_bytes is null
-  // assume all of length bits are valid.
-  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
-
-  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
-
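The validity bitmap manipulated above packs one bit per slot, least-significant bit first within each byte. A minimal standalone sketch of that packing scheme (an illustration only, with BitUtil's lookup tables replaced by plain shifts):

    #include <cstdint>
    #include <vector>

    // Illustrative sketch, not the Arrow implementation: bit (i % 8) of byte
    // (i / 8) is 1 for a valid slot and 0 for a null, matching the layout
    // produced by the bitmap-append helpers above.
    std::vector<uint8_t> PackValidity(const std::vector<bool>& is_valid) {
      std::vector<uint8_t> bitmap((is_valid.size() + 7) / 8, 0);
      for (size_t i = 0; i < is_valid.size(); ++i) {
        if (is_valid[i]) {
          bitmap[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
        }
      }
      return bitmap;
    }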
-  // Set the next length bits to not null (i.e. valid).
-  void UnsafeSetNotNull(int64_t length);
-
-  static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
-
-  static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) {
-    if (new_capacity < 0) {
-      return Status::Invalid("Resize capacity must be positive");
-    }
-    if (new_capacity < old_capacity) {
-      return Status::Invalid("Resize cannot downsize");
-    }
-    return Status::OK();
-  }
-
-  std::shared_ptr<DataType> type_;
-  MemoryPool* pool_;
-
-  // When nulls are first appended to the builder, the null bitmap is allocated
-  std::shared_ptr<ResizableBuffer> null_bitmap_;
-  int64_t null_count_;
-  uint8_t* null_bitmap_data_;
-
-  // Array length, so far. Also, the index of the next element to be added
-  int64_t length_;
-  int64_t capacity_;
-
-  // Child value array builders. These are owned by this class
-  std::vector<std::shared_ptr<ArrayBuilder>> children_;
-
- private:
-  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
-};
-
-class ARROW_EXPORT NullBuilder : public ArrayBuilder {
- public:
-  explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
-      : ArrayBuilder(null(), pool) {}
-
-  Status AppendNull() {
-    ++null_count_;
-    ++length_;
-    return Status::OK();
-  }
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-};
-
-template <typename Type>
-class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
- public:
-  using value_type = typename Type::c_type;
-
-  explicit PrimitiveBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
-      : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {}
-
-  using ArrayBuilder::Advance;
-
-  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
-  /// The memory at the corresponding data slot is set to 0 to prevent uninitialized
-  /// memory access
-  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
-    ARROW_RETURN_NOT_OK(Reserve(length));
-    memset(raw_data_ + length_, 0,
-           static_cast<std::size_t>(TypeTraits<Type>::bytes_required(length)));
-    UnsafeAppendToBitmap(valid_bytes, length);
-    return Status::OK();
-  }
-
-  Status AppendNull() {
-    ARROW_RETURN_NOT_OK(Reserve(1));
-    memset(raw_data_ + length_, 0, sizeof(value_type));
-    UnsafeAppendToBitmap(false);
-    return Status::OK();
-  }
-
-  value_type GetValue(int64_t index) const {
-    return reinterpret_cast<const value_type*>(data_->data())[index];
-  }
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values a contiguous C array of values
-  /// \param[in] length the number of values to append
-  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
-  /// indicates a valid (non-null) value
-  /// \return Status
-  Status AppendValues(const value_type* values, int64_t length,
-                      const uint8_t* valid_bytes = NULLPTR);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values a contiguous C array of values
-  /// \param[in] length the number of values to append
-  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
-  /// (0). Equal in length to values
-  /// \return Status
-  Status AppendValues(const value_type* values, int64_t length,
-                      const std::vector<bool>& is_valid);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values a std::vector of values
-  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
-  /// (0). Equal in length to values
-  /// \return Status
-  Status AppendValues(const std::vector<value_type>& values,
-                      const std::vector<bool>& is_valid);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values a std::vector of values
-  /// \return Status
-  Status AppendValues(const std::vector<value_type>& values);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values_begin InputIterator to the beginning of the values
-  /// \param[in] values_end InputIterator pointing to the end of the values
-  /// \return Status
-
-  template <typename ValuesIter>
-  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-
-    std::copy(values_begin, values_end, raw_data_ + length_);
-
-    // this updates the length_
-    UnsafeSetNotNull(length);
-    return Status::OK();
-  }
-
-  /// \brief Append a sequence of elements in one shot, with a specified nullmap
-  /// \param[in] values_begin InputIterator to the beginning of the values
-  /// \param[in] values_end InputIterator pointing to the end of the values
-  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
-  /// or null (0) values.
-  /// \return Status
-  template <typename ValuesIter, typename ValidIter>
-  typename std::enable_if<!std::is_pointer<ValidIter>::value, Status>::type AppendValues(
-      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
-    static_assert(!internal::is_null_pointer<ValidIter>::value,
-                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
-                  "version instead");
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-
-    std::copy(values_begin, values_end, raw_data_ + length_);
-
-    // this updates the length_
-    UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
-    return Status::OK();
-  }
-
-  /// \brief Append a sequence of elements in one shot, with a specified nullmap
-  /// \param[in] values_begin InputIterator to the beginning of the values
-  /// \param[in] values_end InputIterator pointing to the end of the values
-  /// \param[in] valid_begin uint8_t* indicating valid (1) or null (0) values.
-  /// nullptr indicates all values are valid.
-  /// \return Status
-  template <typename ValuesIter, typename ValidIter>
-  typename std::enable_if<std::is_pointer<ValidIter>::value, Status>::type AppendValues(
-      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-
-    std::copy(values_begin, values_end, raw_data_ + length_);
-
-    // this updates the length_
-    if (valid_begin == NULLPTR) {
-      UnsafeSetNotNull(length);
-    } else {
-      UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
-    }
-
-    return Status::OK();
-  }
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-  void Reset() override;
-
-  Status Resize(int64_t capacity) override;
-
- protected:
-  std::shared_ptr<ResizableBuffer> data_;
-  value_type* raw_data_;
-};
-
-/// Base class for all Builders that emit an Array of a scalar numerical type.
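A usage sketch of the AppendValues overloads above, together with the numeric builder aliases defined just below (hypothetical helper function; only APIs declared in this header are used):

    #include <memory>
    #include <vector>

    #include "arrow/builder.h"

    // Hypothetical example: bulk-append with a validity vector, then finish.
    arrow::Status BuildInt64Example(std::shared_ptr<arrow::Array>* out) {
      arrow::Int64Builder builder;  // uses the default memory pool
      std::vector<int64_t> values = {1, 2, 3, 4};
      std::vector<bool> is_valid = {true, false, true, true};
      // The second slot becomes null; the rest are valid.
      ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
      return builder.Finish(out);
    }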
-template -class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { - public: - using typename PrimitiveBuilder::value_type; - using PrimitiveBuilder::PrimitiveBuilder; - - template - explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool - ARROW_MEMORY_POOL_DEFAULT) - : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} - - using ArrayBuilder::UnsafeAppendNull; - using PrimitiveBuilder::AppendValues; - using PrimitiveBuilder::Resize; - using PrimitiveBuilder::Reserve; - - /// Append a single scalar and increase the size if necessary. - Status Append(const value_type val) { - ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - /// Append a single scalar under the assumption that the underlying Buffer is - /// large enough. - /// - /// This method does not capacity-check; make sure to call Reserve - /// beforehand. - void UnsafeAppend(const value_type val) { - BitUtil::SetBit(null_bitmap_data_, length_); - raw_data_[length_++] = val; - } - - protected: - using PrimitiveBuilder::length_; - using PrimitiveBuilder::null_bitmap_data_; - using PrimitiveBuilder::raw_data_; -}; - -// Builders - -using UInt8Builder = NumericBuilder; -using UInt16Builder = NumericBuilder; -using UInt32Builder = NumericBuilder; -using UInt64Builder = NumericBuilder; - -using Int8Builder = NumericBuilder; -using Int16Builder = NumericBuilder; -using Int32Builder = NumericBuilder; -using Int64Builder = NumericBuilder; -using TimestampBuilder = NumericBuilder; -using Time32Builder = NumericBuilder; -using Time64Builder = NumericBuilder; -using Date32Builder = NumericBuilder; -using Date64Builder = NumericBuilder; - -using HalfFloatBuilder = NumericBuilder; -using FloatBuilder = NumericBuilder; -using DoubleBuilder = NumericBuilder; - -namespace internal { - -class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { - public: - explicit AdaptiveIntBuilderBase(MemoryPool* pool); - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(CommitPendingData()); - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - Status AppendNull() { - pending_data_[pending_pos_] = 0; - pending_valid_[pending_pos_] = 0; - pending_has_nulls_ = true; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - virtual Status CommitPendingData() = 0; - - std::shared_ptr data_; - uint8_t* raw_data_; - uint8_t int_size_; - - static constexpr int32_t pending_size_ = 1024; - uint8_t pending_valid_[pending_size_]; - uint64_t pending_data_[pending_size_]; - int32_t pending_pos_; - bool pending_has_nulls_; -}; - -} // namespace internal - -class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const uint64_t val) { - pending_data_[pending_pos_] = val; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - 
return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const int64_t val) { - auto v = static_cast(val); - - pending_data_[pending_pos_] = v; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const int64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const int64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { - public: - using value_type = bool; - explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); - - using ArrayBuilder::Advance; - using ArrayBuilder::UnsafeAppendNull; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - - return Status::OK(); - } - - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(false); - - return Status::OK(); - } - - /// Scalar append - Status Append(const bool val) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - Status Append(const uint8_t val) { return Append(val != 0); } - - /// Scalar append, without checking for capacity - void UnsafeAppend(const bool val) { - 
BitUtil::SetBit(null_bitmap_data_, length_); - if (val) { - BitUtil::SetBit(raw_data_, length_); - } else { - BitUtil::ClearBit(raw_data_, length_); - } - ++length_; - } - - void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous array of bytes (non-zero is 1) - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// or null(0) values - /// \return Status - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - auto iter = values_begin; - 
internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                   [&iter]() -> bool { return *(iter++); });
-
-    // this updates length_
-    ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
-    return Status::OK();
-  }
-
-  /// \brief Append a sequence of elements in one shot, with a specified nullmap
-  /// \param[in] values_begin InputIterator to the beginning of the values
-  /// \param[in] values_end InputIterator pointing to the end of the values
-  /// \param[in] valid_begin uint8_t* indicating valid (1) or null (0) values.
-  /// nullptr indicates all values are valid.
-  /// \return Status
-  template <typename ValuesIter, typename ValidIter>
-  typename std::enable_if<std::is_pointer<ValidIter>::value, Status>::type AppendValues(
-      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-
-    auto iter = values_begin;
-    internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                   [&iter]() -> bool { return *(iter++); });
-
-    // this updates the length_
-    if (valid_begin == NULLPTR) {
-      UnsafeSetNotNull(length);
-    } else {
-      UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
-    }
-
-    return Status::OK();
-  }
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-  void Reset() override;
-  Status Resize(int64_t capacity) override;
-
- protected:
-  std::shared_ptr<ResizableBuffer> data_;
-  uint8_t* raw_data_;
-};
-
-// ----------------------------------------------------------------------
-// List builder
-
-/// \class ListBuilder
-/// \brief Builder class for variable-length list array value types
-///
-/// To use this class, you must append values to the child array builder and use
-/// the Append function to delimit each distinct list value (once the values
-/// have been appended to the child array) or use the bulk API to append
-/// a sequence of offsets and null values.
-///
-/// A note on types. Per arrow/type.h all types in the c++ implementation are
-/// logical so even though this class always builds a list array, this can
-/// represent multiple different logical types. If no logical type is provided
-/// at construction time, the class defaults to List<T>, where T is taken from the
-/// value_builder/values that the object is constructed with.
-class ARROW_EXPORT ListBuilder : public ArrayBuilder {
- public:
-  /// Use this constructor to incrementally build the value array along with offsets and
-  /// null bitmap.
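A sketch of the ListBuilder protocol described above, building [[1, 2], null, []] (hypothetical helper function; each Append() opens a list slot before the child values are added):

    #include <memory>

    #include "arrow/builder.h"

    // Hypothetical example of the append-then-fill-child protocol.
    arrow::Status BuildListExample(arrow::MemoryPool* pool,
                                   std::shared_ptr<arrow::Array>* out) {
      auto values = std::make_shared<arrow::Int64Builder>(pool);
      arrow::ListBuilder builder(pool, values);
      ARROW_RETURN_NOT_OK(builder.Append());      // open slot for [1, 2]
      ARROW_RETURN_NOT_OK(values->Append(1));
      ARROW_RETURN_NOT_OK(values->Append(2));
      ARROW_RETURN_NOT_OK(builder.AppendNull());  // null list slot
      ARROW_RETURN_NOT_OK(builder.Append());      // open slot for [], add nothing
      return builder.Finish(out);
    }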
- ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type = NULLPTR); - - Status Resize(int64_t capacity) override; - void Reset() override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Start a new variable-length list slot - /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true); - - Status AppendNull() { return Append(false); } - - ArrayBuilder* value_builder() const; - - protected: - TypedBufferBuilder offsets_builder_; - std::shared_ptr value_builder_; - std::shared_ptr values_; - - Status AppendNextOffset(); -}; - -// ---------------------------------------------------------------------- -// Binary and String - -/// \class BinaryBuilder -/// \brief Builder class for variable-length binary data -class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { - public: - explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - Status Append(const uint8_t* value, int32_t length); - - Status Append(const char* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(util::string_view value) { - return Append(value.data(), static_cast(value.size())); - } - - Status AppendNull(); - - /// \brief Append without checking capacity - /// - /// Offsets and data should have been presized using Reserve() and - /// ReserveData(), respectively. - void UnsafeAppend(const uint8_t* value, int32_t length) { - UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend(value, length); - UnsafeAppendToBitmap(true); - } - - void UnsafeAppend(const char* value, int32_t length) { - UnsafeAppend(reinterpret_cast(value), length); - } - - void UnsafeAppend(const std::string& value) { - UnsafeAppend(value.c_str(), static_cast(value.size())); - } - - void UnsafeAppendNull() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - UnsafeAppendToBitmap(false); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - /// \brief Ensures there is enough allocated capacity to append the indicated - /// number of bytes to the value data buffer without additional allocations - Status ReserveData(int64_t elements); - - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return value_data_builder_.length(); } - /// \return capacity of values buffer - int64_t value_data_capacity() const { return value_data_builder_.capacity(); } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i, int32_t* out_length) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. 
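The UnsafeAppend path documented above assumes both buffers were presized; a sketch of that pattern (hypothetical helper function, using only the Reserve/ReserveData/UnsafeAppend members declared in this class):

    #include <memory>

    #include "arrow/builder.h"

    // Hypothetical example of the presize-then-UnsafeAppend pattern.
    arrow::Status BuildBinaryExample(std::shared_ptr<arrow::Array>* out) {
      arrow::BinaryBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Reserve(2));       // offsets/validity for 2 slots
      ARROW_RETURN_NOT_OK(builder.ReserveData(10));  // 10 bytes of value storage
      builder.UnsafeAppend("abc", 3);
      builder.UnsafeAppend("defghij", 7);
      return builder.Finish(out);
    }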
- util::string_view GetView(int64_t i) const; - - protected: - TypedBufferBuilder offsets_builder_; - TypedBufferBuilder value_data_builder_; - - Status AppendNextOffset(); - - void UnsafeAppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - } -}; - -/// \class StringBuilder -/// \brief Builder class for UTF8 strings -class ARROW_EXPORT StringBuilder : public BinaryBuilder { - public: - using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using BinaryBuilder::Append; - using BinaryBuilder::Reset; - using BinaryBuilder::UnsafeAppend; - - /// \brief Append a sequence of strings in one shot. - /// - /// \param[in] values a vector of strings - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const std::vector& values, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of nul-terminated strings in one shot. - /// If one of the values is NULL, it is processed as a null - /// value even if the corresponding valid_bytes entry is 1. - /// - /// \param[in] values a contiguous C array of nul-terminated char * - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); -}; - -// ---------------------------------------------------------------------- -// FixedSizeBinaryBuilder - -class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { - public: - FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - Status Append(const uint8_t* value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value, byte_width_); - } - - Status Append(const char* value) { - return Append(reinterpret_cast(value)); - } - - Status Append(const util::string_view& view) { -#ifndef NDEBUG - CheckValueSize(static_cast(view.size())); -#endif - return Append(reinterpret_cast(view.data())); - } - - Status Append(const std::string& s) { -#ifndef NDEBUG - CheckValueSize(static_cast(s.size())); -#endif - return Append(reinterpret_cast(s.data())); - } - - template - Status Append(const std::array& value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value); - } - - Status AppendValues(const uint8_t* data, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - Status AppendNull(); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return byte_builder_.length(); } - - int32_t byte_width() const { return byte_width_; } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. 
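A sketch of FixedSizeBinaryBuilder usage (hypothetical helper function; fixed_size_binary() is Arrow's standard type factory, and every appended value must have exactly the type's byte width):

    #include <memory>
    #include <string>

    #include "arrow/builder.h"

    // Hypothetical example: width-4 values, with a null slot.
    arrow::Status BuildFixedExample(std::shared_ptr<arrow::Array>* out) {
      arrow::FixedSizeBinaryBuilder builder(arrow::fixed_size_binary(4));
      ARROW_RETURN_NOT_OK(builder.Append(std::string("abcd")));  // exactly 4 bytes
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      return builder.Finish(out);
    }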
-  util::string_view GetView(int64_t i) const;
-
- protected:
-  int32_t byte_width_;
-  BufferBuilder byte_builder_;
-
-#ifndef NDEBUG
-  void CheckValueSize(int64_t size);
-#endif
-};
-
-class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
- public:
-  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
-                             MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
-
-  using FixedSizeBinaryBuilder::Append;
-  using FixedSizeBinaryBuilder::AppendValues;
-  using FixedSizeBinaryBuilder::Reset;
-
-  Status Append(const Decimal128& val);
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-};
-
-using DecimalBuilder = Decimal128Builder;
-
-// ----------------------------------------------------------------------
-// Struct
-
-// ---------------------------------------------------------------------------------
-// StructArray builder
-/// Append, Resize and Reserve methods act on StructBuilder.
-/// Please make sure these methods are called consistently on all child builders
-/// to maintain data-structure consistency.
-class ARROW_EXPORT StructBuilder : public ArrayBuilder {
- public:
-  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
-                std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
-  /// Null bitmap is of equal length to every child field, and any zero byte
-  /// will be considered as a null for that field, but users must use the
-  /// append or advance methods of the child builders independently to
-  /// insert data.
-  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
-    ARROW_RETURN_NOT_OK(Reserve(length));
-    UnsafeAppendToBitmap(valid_bytes, length);
-    return Status::OK();
-  }
-
-  /// Append an element to the Struct. All child-builders' Append method must
-  /// be called independently to maintain data-structure consistency.
-  Status Append(bool is_valid = true) {
-    ARROW_RETURN_NOT_OK(Reserve(1));
-    UnsafeAppendToBitmap(is_valid);
-    return Status::OK();
-  }
-
-  Status AppendNull() { return Append(false); }
-
-  void Reset() override;
-
-  ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); }
-
-  int num_fields() const { return static_cast<int>(field_builders_.size()); }
-
- protected:
-  std::vector<std::shared_ptr<ArrayBuilder>> field_builders_;
-};
-
-// ----------------------------------------------------------------------
-// Dictionary builder
-
-namespace internal {
-
-template <typename T>
-struct DictionaryScalar {
-  using type = typename T::c_type;
-};
-
-template <>
-struct DictionaryScalar<BinaryType> {
-  using type = util::string_view;
-};
-
-template <>
-struct DictionaryScalar<StringType> {
-  using type = util::string_view;
-};
-
-template <>
-struct DictionaryScalar<FixedSizeBinaryType> {
-  using type = util::string_view;
-};
-
-}  // namespace internal
-
-/// \brief Array builder for creating an encoded DictionaryArray from dense
-/// array data
-///
-/// Unlike other builders, dictionary builder does not completely reset the state
-/// on Finish calls. The arrays built after the initial Finish call will reuse
-/// the previously created encoding and build a delta dictionary when new terms
-/// occur.
-template <typename T>
-class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
- public:
-  using Scalar = typename internal::DictionaryScalar<T>::type;
-
-  // WARNING: the type given below is the value type, not the DictionaryType.
-  // The DictionaryType is instantiated on the Finish() call.
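A sketch of the dictionary-encoding flow described above (hypothetical helper function): repeated values are memoized into a single dictionary entry, and Finish() yields a DictionaryArray:

    #include <memory>

    #include "arrow/builder.h"
    #include "arrow/memory_pool.h"

    // Hypothetical example: three appends, two distinct dictionary entries.
    arrow::Status BuildDictExample(std::shared_ptr<arrow::Array>* out) {
      arrow::StringDictionaryBuilder builder(arrow::default_memory_pool());
      ARROW_RETURN_NOT_OK(builder.Append("foo", 3));
      ARROW_RETURN_NOT_OK(builder.Append("bar", 3));
      ARROW_RETURN_NOT_OK(builder.Append("foo", 3));  // reuses the "foo" entry
      return builder.Finish(out);  // result is a DictionaryArray
    }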
- DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - template - explicit DictionaryBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - - ~DictionaryBuilder() override; - - /// \brief Append a scalar value - Status Append(const Scalar& value); - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const uint8_t*>::type value) { - return Append(util::string_view(reinterpret_cast(value), byte_width_)); - } - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const char*>::type value) { - return Append(util::string_view(value, byte_width_)); - } - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// is the dictionary builder in the delta building mode - bool is_building_delta() { return delta_offset_ > 0; } - - protected: - class MemoTableImpl; - std::unique_ptr memo_table_; - - int32_t delta_offset_; - // Only used for FixedSizeBinaryType - int32_t byte_width_; - - AdaptiveIntBuilder values_builder_; -}; - -template <> -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { - public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - explicit DictionaryBuilder(MemoryPool* pool); - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - protected: - AdaptiveIntBuilder values_builder_; -}; - -class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -/// \brief Dictionary array builder with convenience methods for strings -class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -// ---------------------------------------------------------------------- -// Helper functions +class DataType; +class MemoryPool; ARROW_EXPORT Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out); } // namespace arrow - -#endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2f4f5d16364f1..114752934c9f6 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -30,6 +30,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" @@ -324,7 +325,15 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r 
right_data = right.values()->data() + right.offset() * byte_width; } - if (left.null_count() > 0) { + if (byte_width == 0) { + // Special case 0-width data, as the data pointers may be null + for (int64_t i = 0; i < left.length(); ++i) { + if (left.IsNull(i) != right.IsNull(i)) { + return false; + } + } + return true; + } else if (left.null_count() > 0) { for (int64_t i = 0; i < left.length(); ++i) { const bool left_null = left.IsNull(i); const bool right_null = right.IsNull(i); @@ -774,6 +783,98 @@ bool TensorEquals(const Tensor& left, const Tensor& right) { return are_equal; } +namespace { + +template +struct SparseTensorEqualsImpl { + static bool Compare(const SparseTensorImpl& left, + const SparseTensorImpl& right) { + // TODO(mrkn): should we support the equality among different formats? + return false; + } +}; + +template +struct SparseTensorEqualsImpl { + static bool Compare(const SparseTensorImpl& left, + const SparseTensorImpl& right) { + DCHECK(left.type()->id() == right.type()->id()); + DCHECK(left.shape() == right.shape()); + DCHECK(left.non_zero_length() == right.non_zero_length()); + + const auto& left_index = checked_cast(*left.sparse_index()); + const auto& right_index = checked_cast(*right.sparse_index()); + + if (!left_index.Equals(right_index)) { + return false; + } + + const auto& size_meta = dynamic_cast(*left.type()); + const int byte_width = size_meta.bit_width() / CHAR_BIT; + DCHECK_GT(byte_width, 0); + + const uint8_t* left_data = left.data()->data(); + const uint8_t* right_data = right.data()->data(); + + return memcmp(left_data, right_data, + static_cast(byte_width * left.non_zero_length())); + } +}; + +template +inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl& left, + const SparseTensor& right) { + switch (right.format_id()) { + case SparseTensorFormat::COO: { + const auto& right_coo = + checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_coo); + } + + case SparseTensorFormat::CSR: { + const auto& right_csr = + checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csr); + } + + default: + return false; + } +} + +} // namespace + +bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { + if (&left == &right) { + return true; + } else if (left.type()->id() != right.type()->id()) { + return false; + } else if (left.size() == 0) { + return true; + } else if (left.shape() != right.shape()) { + return false; + } else if (left.non_zero_length() != right.non_zero_length()) { + return false; + } + + switch (left.format_id()) { + case SparseTensorFormat::COO: { + const auto& left_coo = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_coo, right); + } + + case SparseTensorFormat::CSR: { + const auto& left_csr = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csr, right); + } + + default: + return false; + } +} + bool TypeEquals(const DataType& left, const DataType& right) { bool are_equal; // The arrays are the same object diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 21e2fdc24f19c..d49d7cc0fdb08 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -29,12 +29,16 @@ namespace arrow { class Array; class DataType; class Tensor; +class SparseTensor; /// Returns true if the arrays are exactly equal bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); +/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly 
equal +bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right); + /// Returns true if the arrays are approximately equal. For non-floating point /// types, this is equivalent to ArrayEquals(left, right) bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right); diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index d4369ed27b7c4..75d152b0bafa3 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -15,20 +15,10 @@ # specific language governing permissions and limitations # under the License. -# Headers: top level -install(FILES - api.h - context.h - kernel.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute") +ARROW_INSTALL_ALL_HEADERS("arrow/compute") # pkg-config support -configure_file(arrow-compute.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-compute.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-compute.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-compute") ####################################### # Unit tests diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 52fc58809604c..8129441b41fa1 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -39,10 +39,8 @@ #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" -#include "arrow/compute/kernels/boolean.h" -#include "arrow/compute/kernels/cast.h" -#include "arrow/compute/kernels/hash.h" #include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" using std::shared_ptr; using std::vector; @@ -50,1551 +48,25 @@ using std::vector; namespace arrow { namespace compute { -class ComputeFixture { - public: - ComputeFixture() : ctx_(default_memory_pool()) {} - - protected: - FunctionContext ctx_; -}; - -template -shared_ptr _MakeArray(const shared_ptr& type, const vector& values, - const vector& is_valid) { - shared_ptr result; - if (is_valid.size() > 0) { - ArrayFromVector(type, is_valid, values, &result); - } else { - ArrayFromVector(type, values, &result); - } - return result; -} - -// ---------------------------------------------------------------------- -// Cast - -static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { - ASSERT_EQ(left.data()->buffers[buffer_index].get(), - right.data()->buffers[buffer_index].get()); -} - -class TestCast : public ComputeFixture, public TestBase { - public: - void CheckPass(const Array& input, const Array& expected, - const shared_ptr& out_type, const CastOptions& options) { - shared_ptr result; - ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); - ASSERT_ARRAYS_EQUAL(expected, *result); - } - - template - void CheckFails(const shared_ptr& in_type, const vector& in_values, - const vector& is_valid, const shared_ptr& out_type, - const CastOptions& options) { - shared_ptr input, result; - if (is_valid.size() > 0) { - ArrayFromVector(in_type, is_valid, in_values, &input); - } else { - ArrayFromVector(in_type, in_values, &input); - } - ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); - } - - void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { - shared_ptr result; - ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); - AssertBufferSame(input, *result, 0); - AssertBufferSame(input, *result, 1); - } - - template - void CheckCase(const shared_ptr& in_type, const vector& in_values, - const vector& is_valid, const 
shared_ptr& out_type, - const vector& out_values, const CastOptions& options) { - DCHECK_EQ(in_values.size(), out_values.size()); - shared_ptr input, expected; - if (is_valid.size() > 0) { - DCHECK_EQ(is_valid.size(), out_values.size()); - ArrayFromVector(in_type, is_valid, in_values, &input); - ArrayFromVector(out_type, is_valid, out_values, &expected); - } else { - ArrayFromVector(in_type, in_values, &input); - ArrayFromVector(out_type, out_values, &expected); - } - CheckPass(*input, *expected, out_type, options); - - // Check a sliced variant - if (input->length() > 1) { - CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); - } - } -}; - -TEST_F(TestCast, SameTypeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; - - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - shared_ptr result; - ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); - - AssertBufferSame(*arr, *result, 0); - AssertBufferSame(*arr, *result, 1); -} - -TEST_F(TestCast, ToBoolean) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // int8, should suffice for other integers - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {false, true, true, true, false}; - CheckCase(int8(), v1, is_valid, boolean(), e1, - options); - - // floating point - vector v2 = {1.0, 0, 0, -1.0, 5.0}; - vector e2 = {true, false, false, true, true}; - CheckCase(float64(), v2, is_valid, boolean(), e2, - options); -} - -TEST_F(TestCast, ToIntUpcast) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - // int8 to int32 - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {0, 1, 127, -1, 0}; - CheckCase(int8(), v1, is_valid, int32(), e1, - options); - - // bool to int8 - vector v2 = {false, true, false, true, true}; - vector e2 = {0, 1, 0, 1, 1}; - CheckCase(boolean(), v2, is_valid, int8(), e2, - options); - - // uint8 to int16, no overflow/underrun - vector v3 = {0, 100, 200, 255, 0}; - vector e3 = {0, 100, 200, 255, 0}; - CheckCase(uint8(), v3, is_valid, int16(), e3, - options); -} - -TEST_F(TestCast, OverflowInNullSlot) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - vector v11 = {0, 70000, 2000, 1000, 0}; - vector e11 = {0, 0, 2000, 1000, 0}; - - shared_ptr expected; - ArrayFromVector(int16(), is_valid, e11, &expected); - - auto buf = Buffer::Wrap(v11.data(), v11.size()); - Int32Array tmp11(5, buf, expected->null_bitmap(), -1); - - CheckPass(tmp11, *expected, int16(), options); -} - -TEST_F(TestCast, ToIntDowncastSafe) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, uint8(), e1, - options); - - // int16 to uint8, with overflow - vector v2 = {0, 100, 256, 0, 0}; - CheckFails(int16(), v2, is_valid, uint8(), options); - - // underflow - vector v3 = {0, 100, -1, 0, 0}; - CheckFails(int16(), v3, is_valid, uint8(), options); - - // int32 to int16, no overflow - vector v4 = {0, 1000, 2000, 1, 2}; - vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(int32(), v4, is_valid, int16(), e4, - options); - - // int32 to int16, overflow - vector v5 = {0, 1000, 2000, 70000, 0}; - CheckFails(int32(), v5, is_valid, int16(), options); - - // underflow - vector v6 = {0, 1000, 2000, -70000, 0}; - 
CheckFails(int32(), v6, is_valid, int16(), options); - - vector v7 = {0, 1000, 2000, -70000, 0}; - CheckFails(int32(), v7, is_valid, uint8(), options); -} - -template -std::vector UnsafeVectorCast(const std::vector& v) { - size_t n_elems = v.size(); - std::vector result(n_elems); - - for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); - - return std::move(result); -} - -TEST_F(TestCast, IntegerSignedToUnsigned) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; - - // Same width - CheckFails(int32(), v1, is_valid, uint32(), options); - // Wider - CheckFails(int32(), v1, is_valid, uint64(), options); - // Narrower - CheckFails(int32(), v1, is_valid, uint16(), options); - // Fail because of overflow (instead of underflow). - vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; - CheckFails(int32(), over, is_valid, uint16(), options); - - options.allow_int_overflow = true; - - CheckCase( - int32(), v1, is_valid, uint32(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), v1, is_valid, uint64(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), v1, is_valid, uint16(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), over, is_valid, uint16(), UnsafeVectorCast(over), - options); -} - -TEST_F(TestCast, IntegerUnsignedToSigned) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, true, true}; - - vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; - vector v2 = {0, INT16_MAX + 1, 2}; - // Same width - CheckFails(uint32(), v1, is_valid, int32(), options); - // Narrower - CheckFails(uint32(), v1, is_valid, int16(), options); - CheckFails(uint32(), v2, is_valid, int16(), options); - - options.allow_int_overflow = true; - - CheckCase( - uint32(), v1, is_valid, int32(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v1, is_valid, int64(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v1, is_valid, int16(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v2, is_valid, int16(), UnsafeVectorCast(v2), options); -} - -TEST_F(TestCast, ToIntDowncastUnsafe) { - CastOptions options; - options.allow_int_overflow = true; - - vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, uint8(), e1, - options); - - // int16 to uint8, with overflow - vector v2 = {0, 100, 256, 0, 0}; - vector e2 = {0, 100, 0, 0, 0}; - CheckCase(int16(), v2, is_valid, uint8(), e2, - options); - - // underflow - vector v3 = {0, 100, -1, 0, 0}; - vector e3 = {0, 100, 255, 0, 0}; - CheckCase(int16(), v3, is_valid, uint8(), e3, - options); - - // int32 to int16, no overflow - vector v4 = {0, 1000, 2000, 1, 2}; - vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(int32(), v4, is_valid, int16(), e4, - options); - - // int32 to int16, overflow - // TODO(wesm): do we want to allow this? we could set to null - vector v5 = {0, 1000, 2000, 70000, 0}; - vector e5 = {0, 1000, 2000, 4464, 0}; - CheckCase(int32(), v5, is_valid, int16(), e5, - options); - - // underflow - // TODO(wesm): do we want to allow this? 
we could set overflow to null - vector v6 = {0, 1000, 2000, -70000, 0}; - vector e6 = {0, 1000, 2000, -4464, 0}; - CheckCase(int32(), v6, is_valid, int16(), e6, - options); -} - -TEST_F(TestCast, FloatingPointToInt) { - // which means allow_float_truncate == false - auto options = CastOptions::Safe(); - - vector is_valid = {true, false, true, true, true}; - vector all_valid = {true, true, true, true, true}; - - // float32 to int32 no truncation - vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e1 = {1, 0, 0, -1, 5}; - CheckCase(float32(), v1, is_valid, int32(), e1, - options); - CheckCase(float32(), v1, all_valid, int32(), e1, - options); - - // float64 to int32 no truncation - vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e2 = {1, 0, 0, -1, 5}; - CheckCase(float64(), v2, is_valid, int32(), e2, - options); - CheckCase(float64(), v2, all_valid, int32(), e2, - options); - - // float64 to int64 no truncation - vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e3 = {1, 0, 0, -1, 5}; - CheckCase(float64(), v3, is_valid, int64(), e3, - options); - CheckCase(float64(), v3, all_valid, int64(), e3, - options); - - // float64 to int32 truncate - vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; - vector e4 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(float64(), v4, is_valid, int32(), options); - CheckFails(float64(), v4, all_valid, int32(), options); - - options.allow_float_truncate = true; - CheckCase(float64(), v4, is_valid, int32(), e4, - options); - CheckCase(float64(), v4, all_valid, int32(), e4, - options); - - // float64 to int64 truncate - vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; - vector e5 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(float64(), v5, is_valid, int64(), options); - CheckFails(float64(), v5, all_valid, int64(), options); - - options.allow_float_truncate = true; - CheckCase(float64(), v5, is_valid, int64(), e5, - options); - CheckCase(float64(), v5, all_valid, int64(), e5, - options); -} - -TEST_F(TestCast, IntToFloatingPoint) { - auto options = CastOptions::Safe(); - - vector all_valid = {true, true, true, true, true}; - vector all_invalid = {false, false, false, false, false}; - - vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; - CheckFails(int64(), v1, all_valid, float32(), options); - - // While it's not safe to convert, all values are null. 
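// Aside (illustrative sketch, not part of the original patch): the
// int64 -> float32 cast above must fail for valid values because an
// IEEE-754 float carries only a 24-bit significand, so integers larger
// than 2^24 are rounded and information is silently lost:
//
//   #include <cassert>
//   #include <cstdint>
//
//   int main() {
//     int64_t big = (1LL << 24) + 1;           // 16777217
//     float f = static_cast<float>(big);       // rounds to 16777216.0f
//     assert(static_cast<int64_t>(f) != big);  // the round trip is lossy
//     return 0;
//   }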
- CheckCase(int64(), v1, all_invalid, float64(), - UnsafeVectorCast(v1), - options); -} - -TEST_F(TestCast, TimestampToTimestamp) { - CastOptions options; - - auto CheckTimestampCast = - [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, - const vector& from_values, const vector& to_values, - const vector& is_valid) { - CheckCase( - timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, - options); - }; - - vector is_valid = {true, false, true, true, true}; - - // Multiply promotions - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); - - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); - - vector v3 = {0, 100, 200, 1, 2}; - vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); - - vector v4 = {0, 100, 200, 1, 2}; - vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); - - vector v5 = {0, 100, 200, 1, 2}; - vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); - - vector v6 = {0, 100, 200, 1, 2}; - vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); - - // Zero copy - vector v7 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, - &arr); - CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); - - // ARROW-1773, cast to integer - CheckZeroCopy(*arr, int64()); - - // Divide, truncate - vector v8 = {0, 100123, 200456, 1123, 2456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); - - vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - vector e9 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); - - vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - vector e10 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, - timestamp(TimeUnit::SECOND), options); - CheckFails(timestamp(TimeUnit::MICRO), v8, is_valid, - timestamp(TimeUnit::MILLI), options); - CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, - timestamp(TimeUnit::MICRO), options); - CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, - timestamp(TimeUnit::SECOND), options); - CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, - timestamp(TimeUnit::MILLI), options); - CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, - timestamp(TimeUnit::SECOND), options); -} - -TEST_F(TestCast, TimestampToDate32_Date64) { - CastOptions options; - - vector is_valid = {true, true, false}; - - // 2000-01-01, 2000-01-02, null - vector v_nano = 
{946684800000000000, 946771200000000000, 0}; - vector v_micro = {946684800000000, 946771200000000, 0}; - vector v_milli = {946684800000, 946771200000, 0}; - vector v_second = {946684800, 946771200, 0}; - vector v_day = {10957, 10958, 0}; - - // Simple conversions - CheckCase( - timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); - - CheckCase( - timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); - - // Disallow truncate, failures - vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; - vector v_micro_fail = {946684800000001, 946771200000001, 0}; - vector v_milli_fail = {946684800001, 946771200001, 0}; - vector v_second_fail = {946684801, 946771201, 0}; - - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date64(), options); - - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date32(), options); - - // Make sure that nulls are excluded from the truncation checks - vector v_second_nofail = {946684800, 946771200, 1}; - CheckCase( - timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); -} - -TEST_F(TestCast, TimeToCompatible) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // Multiply promotions - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time32(TimeUnit::SECOND), v1, is_valid, time32(TimeUnit::MILLI), e1, options); - - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase( - time32(TimeUnit::SECOND), v2, is_valid, time64(TimeUnit::MICRO), e2, options); - - vector v3 = {0, 100, 200, 1, 2}; - vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckCase( - time32(TimeUnit::SECOND), v3, is_valid, time64(TimeUnit::NANO), e3, options); - - vector v4 = {0, 100, 200, 1, 2}; - vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time32(TimeUnit::MILLI), v4, is_valid, time64(TimeUnit::MICRO), e4, options); - - vector v5 = {0, 100, 200, 1, 2}; - vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase( - time32(TimeUnit::MILLI), v5, is_valid, time64(TimeUnit::NANO), e5, options); - - vector v6 = {0, 100, 200, 1, 2}; - vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time64(TimeUnit::MICRO), v6, is_valid, 
time64(TimeUnit::NANO), e6, options); - - // Zero copy - vector v7 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int64()); - - vector v7_2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int32()); - - // Divide, truncate - vector v8 = {0, 100123, 200456, 1123, 2456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckCase( - time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), e8, options); - CheckCase( - time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), e8, options); - CheckCase( - time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), e8, options); - - vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - vector e9 = {0, 100, 200, 1, 2}; - CheckCase( - time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), e9, options); - CheckCase( - time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), e9, options); - - vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - vector e10 = {0, 100, 200, 1, 2}; - CheckCase( - time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), e10, options); - - // Disallow truncate, failures - - options.allow_time_truncate = false; - CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), - options); - CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), - options); - CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), - options); - CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), - options); - CheckFails(time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), - options); - CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), - options); -} - -TEST_F(TestCast, PrimitiveZeroCopy) { - shared_ptr arr; - - ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint8()); - ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int8()); - - ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint16()); - ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int16()); - - ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint32()); - ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint64()); - ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int64()); - - ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float32()); - - ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float64()); -} - -TEST_F(TestCast, DateToCompatible) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - constexpr int64_t F = 86400000; - - // Multiply promotion - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; - CheckCase(date32(), v1, is_valid, date64(), - e1, options); - - // Zero copy - vector v2 = {0, 70000, 2000, 1000, 0}; - vector v3 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(date32(), is_valid, v2, 
&arr); - CheckZeroCopy(*arr, date32()); - - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(date64(), is_valid, v3, &arr); - CheckZeroCopy(*arr, date64()); - - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int64()); - - // Divide, truncate - vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckCase(date64(), v8, is_valid, date32(), - e8, options); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(date64(), v8, is_valid, date32(), options); -} - -TEST_F(TestCast, ToDouble) { - CastOptions options; - vector is_valid = {true, false, true, true, true}; - - // int16 to double - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, float64(), e1, - options); - - // float to double - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100, 200, 1, 2}; - CheckCase(float32(), v2, is_valid, float64(), e2, - options); - - // bool to double - vector v3 = {true, true, false, false, true}; - vector e3 = {1, 1, 0, 0, 1}; - CheckCase(boolean(), v3, is_valid, float64(), e3, - options); -} - -TEST_F(TestCast, ChunkedArray) { - vector values1 = {0, 1, 2}; - vector values2 = {3, 4, 5}; - - auto type = int16(); - auto out_type = int64(); - - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); - - CastOptions options; - - Datum out; - ASSERT_OK(Cast(&this->ctx_, Datum(carr), out_type, options, &out)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); - - auto out_carr = out.chunked_array(); - - vector ex_values1 = {0, 1, 2}; - vector ex_values2 = {3, 4, 5}; - auto a3 = _MakeArray(out_type, ex_values1, {}); - auto a4 = _MakeArray(out_type, ex_values2, {}); - - ArrayVector ex_arrays = {a3, a4}; - auto ex_carr = std::make_shared(ex_arrays); - - ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); -} - -TEST_F(TestCast, UnsupportedTarget) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; - - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - shared_ptr result; - ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); -} - -TEST_F(TestCast, DateTimeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - - vector v1 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - CheckZeroCopy(*arr, date32()); - - vector v2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(int64(), is_valid, v2, &arr); - - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - CheckZeroCopy(*arr, date64()); - CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); -} - -TEST_F(TestCast, FromNull) { - // Null casts to everything - const int length = 10; - - NullArray arr(length); - - shared_ptr result; - ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); - - ASSERT_EQ(length, result->length()); - ASSERT_EQ(length, result->null_count()); - - // OK to look at bitmaps - ASSERT_ARRAYS_EQUAL(*result, *result); -} - -TEST_F(TestCast, PreallocatedMemory) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - const int64_t length = 5; - - shared_ptr arr; - vector v1 = {0, 70000, 2000, 1000, 0}; - vector e1 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(int32(), is_valid, v1, &arr); - - 
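// What follows exercises the contract documented on UnaryKernel::Call: the
// caller assembles the output ArrayData by hand with a preallocated values
// buffer, invokes the cast kernel directly, and afterwards asserts that the
// buffer address is unchanged, i.e. the kernel wrote into the memory it was
// given rather than allocating a buffer of its own.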
auto out_type = int64(); - - std::unique_ptr kernel; - ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); - - auto out_data = ArrayData::Make(out_type, length); - - shared_ptr out_values; - ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); - - out_data->buffers.push_back(nullptr); - out_data->buffers.push_back(out_values); - - Datum out(out_data); - ASSERT_OK(kernel->Call(&this->ctx_, Datum(arr), &out)); - - // Buffer address unchanged - ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); - - shared_ptr result = MakeArray(out_data); - shared_ptr expected; - ArrayFromVector(int64(), is_valid, e1, &expected); - - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -template -void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, - const vector& in_values, - const std::shared_ptr& out_type, - const vector& out_values) { - using OutTraits = TypeTraits; - - CastOptions options; - - const int64_t length = static_cast(in_values.size()); - - shared_ptr arr, expected; - ArrayFromVector(in_type, in_values, &arr); - ArrayFromVector(out_type, out_values, &expected); - - shared_ptr out_buffer; - ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); - - std::unique_ptr kernel; - ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); - - const int64_t first_half = length / 2; - - auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); - auto out_second_data = out_data->Copy(); - out_second_data->offset = first_half; - - Datum out_first(out_data); - Datum out_second(out_second_data); - - // Cast each bit - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first)); - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second)); - - shared_ptr result = MakeArray(out_data); - - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -TEST_F(TestCast, OffsetOutputBuffer) { - // ARROW-1735 - vector v1 = {0, 10000, 2000, 1000, 0}; - vector e1 = {0, 10000, 2000, 1000, 0}; - - auto in_type = int32(); - auto out_type = int64(); - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - out_type, e1); - - vector e2 = {false, true, true, true, false}; - - out_type = boolean(); - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - boolean(), e2); - - vector e3 = {0, 10000, 2000, 1000, 0}; - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - int16(), e3); -} - -TEST_F(TestCast, StringToBoolean) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - vector v1 = {"False", "true", "true", "True", "false"}; - vector v2 = {"0", "1", "1", "1", "0"}; - vector e = {false, true, true, true, false}; - CheckCase(utf8(), v1, is_valid, boolean(), - e, options); - CheckCase(utf8(), v2, is_valid, boolean(), - e, options); -} - -TEST_F(TestCast, StringToBooleanErrors) { - CastOptions options; - - vector is_valid = {true}; - - CheckFails(utf8(), {"false "}, is_valid, boolean(), options); - CheckFails(utf8(), {"T"}, is_valid, boolean(), options); -} - -TEST_F(TestCast, StringToNumber) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // string to int - vector v_int = {"0", "1", "127", "-1", "0"}; - vector e_int8 = {0, 1, 127, -1, 0}; - vector e_int16 = {0, 1, 127, -1, 0}; - vector e_int32 = {0, 1, 127, -1, 0}; - vector e_int64 = {0, 1, 127, -1, 0}; - CheckCase(utf8(), v_int, is_valid, int8(), - e_int8, options); - CheckCase(utf8(), v_int, is_valid, int16(), - e_int16, options); - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - 
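// The cases below probe each integer width at its exact representable
// limits (INT32_MAX/INT32_MIN, INT64_MAX/INT64_MIN, UINT32_MAX, UINT64_MAX),
// so the string parser is exercised right at its boundaries.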
CheckCase(utf8(), v_int, is_valid, int64(), - e_int64, options); - - v_int = {"2147483647", "0", "-2147483648", "0", "0"}; - e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; - e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; - CheckCase(utf8(), v_int, is_valid, int64(), - e_int64, options); - - // string to uint - vector v_uint = {"0", "1", "127", "255", "0"}; - vector e_uint8 = {0, 1, 127, 255, 0}; - vector e_uint16 = {0, 1, 127, 255, 0}; - vector e_uint32 = {0, 1, 127, 255, 0}; - vector e_uint64 = {0, 1, 127, 255, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint8(), e_uint8, options); - CheckCase(utf8(), v_uint, is_valid, - uint16(), e_uint16, options); - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - v_uint = {"4294967295", "0", "0", "0", "0"}; - e_uint32 = {4294967295, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - v_uint = {"18446744073709551615", "0", "0", "0", "0"}; - e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - // string to float - vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; - vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; - vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - - // Test that casting is locale-independent - auto global_locale = std::locale(); - try { - // French locale uses the comma as decimal point - std::locale::global(std::locale("fr_FR.UTF-8")); - } catch (std::runtime_error&) { - // Locale unavailable, ignore - } - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - std::locale::global(global_locale); -} - -TEST_F(TestCast, StringToNumberErrors) { - CastOptions options; - - vector is_valid = {true}; - - CheckFails(utf8(), {"z"}, is_valid, int8(), options); - CheckFails(utf8(), {"12 z"}, is_valid, int8(), options); - CheckFails(utf8(), {"128"}, is_valid, int8(), options); - CheckFails(utf8(), {"-129"}, is_valid, int8(), options); - CheckFails(utf8(), {"0.5"}, is_valid, int8(), options); - - CheckFails(utf8(), {"256"}, is_valid, uint8(), options); - CheckFails(utf8(), {"-1"}, is_valid, uint8(), options); - - CheckFails(utf8(), {"z"}, is_valid, float32(), options); -} - -TEST_F(TestCast, StringToTimestamp) { - CastOptions options; - - vector is_valid = {true, false, true}; - vector strings = {"1970-01-01", "xxx", "2000-02-29"}; - - auto type = timestamp(TimeUnit::SECOND); - vector e = {0, 0, 951782400}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - type = timestamp(TimeUnit::MICRO); - e = {0, 0, 951782400000000LL}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc -} - -TEST_F(TestCast, StringToTimestampErrors) { - CastOptions options; - - vector is_valid = {true}; - - for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { - auto type = timestamp(unit); - CheckFails(utf8(), {""}, is_valid, type, options); - CheckFails(utf8(), {"xxx"}, is_valid, type, options); - } -} - -template -class TestDictionaryCast : 
public TestCast {}; - -typedef ::testing::Types - TestTypes; - -TYPED_TEST_CASE(TestDictionaryCast, TestTypes); - -TYPED_TEST(TestDictionaryCast, Basic) { - CastOptions options; - shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - - Datum out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(plain_array->data()), &out)); - - this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); -} - -TEST_F(TestCast, DictToNonDictNoNulls) { - vector dict_values = {"foo", "bar", "baz"}; - auto ex_dict = _MakeArray(utf8(), dict_values, {}); - auto dict_type = dictionary(int32(), ex_dict); - - // Explicitly construct with nullptr for the null_bitmap_data - std::vector i1 = {1, 0, 1}; - std::vector i2 = {2, 1, 0, 1}; - auto c1 = std::make_shared>(3, Buffer::Wrap(i1)); - auto c2 = std::make_shared>(4, Buffer::Wrap(i2)); - - ArrayVector dict_arrays = {std::make_shared(dict_type, c1), - std::make_shared(dict_type, c2)}; - auto dict_carr = std::make_shared(dict_arrays); - - Datum cast_input(dict_carr); - Datum cast_output; - // Ensure that casting works even when the null_bitmap_data array is a nullptr - ASSERT_OK(Cast(&this->ctx_, cast_input, - static_cast(*dict_type).dictionary()->type(), - CastOptions(), &cast_output)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, cast_output.kind()); - - auto e1 = _MakeArray(utf8(), {"bar", "foo", "bar"}, {}); - auto e2 = _MakeArray(utf8(), {"baz", "bar", "foo", "bar"}, {}); - - auto chunks = cast_output.chunked_array()->chunks(); - ASSERT_EQ(chunks.size(), 2); - ASSERT_ARRAYS_EQUAL(*e1, *chunks[0]); - ASSERT_ARRAYS_EQUAL(*e2, *chunks[1]); -} - -/*TYPED_TEST(TestDictionaryCast, Reverse) { - CastOptions options; - shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - - shared_ptr dict_array; - ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); - - this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); -}*/ - -TEST_F(TestCast, ListToList) { - CastOptions options; - std::shared_ptr offsets; - - vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; - std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; - ArrayFromVector(offsets_is_valid, offsets_values, &offsets); - - shared_ptr int32_plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - std::shared_ptr int32_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); - - std::shared_ptr int64_plain_array; - ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); - std::shared_ptr int64_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); - - std::shared_ptr float64_plain_array; - ASSERT_OK( - Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); - std::shared_ptr float64_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); - - CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options); - CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); - CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options); - CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options); - - options.allow_float_truncate = true; - CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); - CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), 
options); -} - // ---------------------------------------------------------------------- -// Dictionary tests - -template -void CheckUnique(FunctionContext* ctx, const shared_ptr& type, - const vector& in_values, const vector& in_is_valid, - const vector& out_values, const vector& out_is_valid) { - shared_ptr input = _MakeArray(type, in_values, in_is_valid); - shared_ptr expected = _MakeArray(type, out_values, out_is_valid); - - shared_ptr result; - ASSERT_OK(Unique(ctx, Datum(input), &result)); - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -template -void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, - const vector& in_values, const vector& in_is_valid, - const vector& out_values, const vector& out_is_valid, - const vector& out_indices) { - shared_ptr input = _MakeArray(type, in_values, in_is_valid); - shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); - shared_ptr ex_indices = - _MakeArray(int32(), out_indices, in_is_valid); - - DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); - - Datum datum_out; - ASSERT_OK(DictionaryEncode(ctx, Datum(input), &datum_out)); - shared_ptr result = MakeArray(datum_out.array()); - - ASSERT_ARRAYS_EQUAL(expected, *result); -} - -class TestHashKernel : public ComputeFixture, public TestBase {}; - -template -class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; - -typedef ::testing::Types - PrimitiveDictionaries; - -TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); - -TYPED_TEST(TestHashKernelPrimitive, Unique) { - using T = typename TypeParam::c_type; - auto type = TypeTraits::type_singleton(); - CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, - {2, 1}, {}); - CheckUnique(&this->ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, - {3, 1}, {}); -} - -TYPED_TEST(TestHashKernelPrimitive, DictEncode) { - using T = typename TypeParam::c_type; - auto type = TypeTraits::type_singleton(); - CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, - {true, false, true, true, true, true}, {2, 1, 3}, {}, - {0, 0, 0, 1, 0, 2}); -} - -TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { - using T = typename TypeParam::c_type; - // Skip this test for (u)int8 - if (sizeof(Scalar) == 1) { - return; - } - - const int64_t kTotalValues = 1000000; - const int64_t kRepeats = 5; - - vector values; - vector uniques; - vector indices; - for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { - const auto val = static_cast(i % kTotalValues); - values.push_back(val); - - if (i < kTotalValues) { - uniques.push_back(val); - } - indices.push_back(static_cast(i % kTotalValues)); - } - - auto type = TypeTraits::type_singleton(); - CheckUnique(&this->ctx_, type, values, {}, uniques, {}); - - CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); -} - -TEST_F(TestHashKernel, UniqueTimeTimestamp) { - CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); - - CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); - - CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), - {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, - {}); -} - -TEST_F(TestHashKernel, UniqueBoolean) { - CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, - {true, false, true, true}, {true, false}, {}); - - CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, - {true, false, true, true}, {false, true}, {}); - - // No nulls - CheckUnique(&this->ctx_, boolean(), {true, true, 
false, true}, {}, - {true, false}, {}); - - CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, {}, - {false, true}, {}); -} - -TEST_F(TestHashKernel, DictEncodeBoolean) { - CheckDictEncode( - &this->ctx_, boolean(), {true, true, false, true, false}, - {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); - - CheckDictEncode( - &this->ctx_, boolean(), {false, true, false, true, false}, - {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); - - // No nulls - CheckDictEncode(&this->ctx_, boolean(), - {true, true, false, true, false}, {}, {true, false}, - {}, {0, 0, 1, 0, 1}); - - CheckDictEncode(&this->ctx_, boolean(), - {false, true, false, true, false}, {}, {false, true}, - {}, {0, 1, 0, 1, 0}); -} - -TEST_F(TestHashKernel, UniqueBinary) { - CheckUnique(&this->ctx_, binary(), - {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "test2"}, {}); - - CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "test2"}, {}); -} - -TEST_F(TestHashKernel, DictEncodeBinary) { - CheckDictEncode( - &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, - {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); - - CheckDictEncode( - &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, - {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); -} - -TEST_F(TestHashKernel, BinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif - - vector values; - vector uniques; - vector indices; - char buf[20] = "test"; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); - values.emplace_back(buf); - - if (i < kTotalValues) { - uniques.push_back(values.back()); - } - indices.push_back(index); - } - - CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); - CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, - indices); - - CheckUnique(&this->ctx_, utf8(), values, {}, uniques, {}); - CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, - indices); -} - -TEST_F(TestHashKernel, UniqueFixedSizeBinary) { - CheckUnique( - &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, - {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); -} +// Datum -TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { - CheckDictEncode( - &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); +template +void CheckImplicitConstructor(enum Datum::type expected_kind) { + std::shared_ptr value; + Datum datum = value; + ASSERT_EQ(expected_kind, datum.kind()); } -TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif +TEST(TestDatum, ImplicitConstructors) { + CheckImplicitConstructor(Datum::ARRAY); - vector values; - vector uniques; - vector indices; - char buf[7] = "test.."; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - buf[4] = static_cast(index / 128); - buf[5] = static_cast(index % 128); - values.emplace_back(buf, 6); - - if (i < kTotalValues) { - 
uniques.push_back(values.back()); - } - indices.push_back(index); - } - - auto type = fixed_size_binary(6); - CheckUnique(&this->ctx_, type, values, {}, uniques, - {}); - CheckDictEncode(&this->ctx_, type, values, {}, - uniques, {}, indices); -} - -TEST_F(TestHashKernel, UniqueDecimal) { - vector values{12, 12, 11, 12}; - vector expected{12, 11}; - - CheckUnique(&this->ctx_, decimal(2, 0), values, - {true, false, true, true}, expected, {}); -} - -TEST_F(TestHashKernel, DictEncodeDecimal) { - vector values{12, 12, 11, 12, 13}; - vector expected{12, 11, 13}; - - CheckDictEncode(&this->ctx_, decimal(2, 0), values, - {true, false, true, true, true}, expected, - {}, {0, 0, 1, 0, 2}); -} - -TEST_F(TestHashKernel, ChunkedArrayInvoke) { - vector values1 = {"foo", "bar", "foo"}; - vector values2 = {"bar", "baz", "quuux", "foo"}; - - auto type = utf8(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - vector dict_values = {"foo", "bar", "baz", "quuux"}; - auto ex_dict = _MakeArray(type, dict_values, {}); - - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); - - // Unique - shared_ptr result; - ASSERT_OK(Unique(&this->ctx_, Datum(carr), &result)); - ASSERT_ARRAYS_EQUAL(*ex_dict, *result); - - // Dictionary encode - auto dict_type = dictionary(int32(), ex_dict); - - auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); - auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); - - ArrayVector dict_arrays = {std::make_shared(dict_type, i1), - std::make_shared(dict_type, i2)}; - auto dict_carr = std::make_shared(dict_arrays); - - Datum encoded_out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(carr), &encoded_out)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); - - AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); -} - -using BinaryKernelFunc = - std::function; - -class TestBooleanKernel : public ComputeFixture, public TestBase { - public: - void TestArrayBinary(const BinaryKernelFunc& kernel, const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - Datum result; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - std::shared_ptr result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(expected)); - } - - void TestChunkedArrayBinary(const BinaryKernelFunc& kernel, - const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - Datum result; - std::shared_ptr result_array; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); - std::shared_ptr result_ca = result.chunked_array(); - ASSERT_TRUE(result_ca->Equals(expected)); - } - - void TestBinaryKernel(const BinaryKernelFunc& kernel, const std::vector& values1, - const std::vector& values2, - const std::vector& values3, - const std::vector& values3_nulls) { - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - auto a3 = _MakeArray(type, values3, {}); - auto a1_nulls = _MakeArray(type, values1, values1); - auto a2_nulls = _MakeArray(type, values2, values2); - auto a3_nulls = _MakeArray(type, values3, values3_nulls); - - TestArrayBinary(kernel, a1, a2, a3); - TestArrayBinary(kernel, a1_nulls, a2_nulls, a3_nulls); - TestArrayBinary(kernel, a1->Slice(1), a2->Slice(1), a3->Slice(1)); - TestArrayBinary(kernel, a1_nulls->Slice(1), a2_nulls->Slice(1), a3_nulls->Slice(1)); - - // ChunkedArray - std::vector> ca1_arrs = 
{a1, a1->Slice(1)}; - auto ca1 = std::make_shared(ca1_arrs); - std::vector> ca2_arrs = {a2, a2->Slice(1)}; - auto ca2 = std::make_shared(ca2_arrs); - std::vector> ca3_arrs = {a3, a3->Slice(1)}; - auto ca3 = std::make_shared(ca3_arrs); - TestChunkedArrayBinary(kernel, ca1, ca2, ca3); - - // ChunkedArray with different chunks - std::vector> ca4_arrs = {a1->Slice(0, 1), a1->Slice(1), - a1->Slice(1, 1), a1->Slice(2)}; - auto ca4 = std::make_shared(ca4_arrs); - TestChunkedArrayBinary(kernel, ca4, ca2, ca3); - } -}; - -TEST_F(TestBooleanKernel, Invert) { - vector values1 = {true, false, true}; - vector values2 = {false, true, false}; - - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - // Plain array - Datum result; - ASSERT_OK(Invert(&this->ctx_, Datum(a1), &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - std::shared_ptr result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(a2)); - - // Array with offset - ASSERT_OK(Invert(&this->ctx_, Datum(a1->Slice(1)), &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(a2->Slice(1))); - - // ChunkedArray - std::vector> ca1_arrs = {a1, a1->Slice(1)}; - auto ca1 = std::make_shared(ca1_arrs); - std::vector> ca2_arrs = {a2, a2->Slice(1)}; - auto ca2 = std::make_shared(ca2_arrs); - ASSERT_OK(Invert(&this->ctx_, Datum(ca1), &result)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); - std::shared_ptr result_ca = result.chunked_array(); - ASSERT_TRUE(result_ca->Equals(ca2)); -} - -TEST_F(TestBooleanKernel, And) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {true, false, false, false, true, false}; - TestBinaryKernel(And, values1, values2, values3, values3); -} - -TEST_F(TestBooleanKernel, Or) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {true, true, true, false, true, true}; - vector values3_nulls = {true, false, false, false, true, false}; - TestBinaryKernel(Or, values1, values2, values3, values3_nulls); -} + // Instantiate from array subclass + CheckImplicitConstructor(Datum::ARRAY); -TEST_F(TestBooleanKernel, Xor) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {false, true, true, false, false, true}; - vector values3_nulls = {true, false, false, false, true, false}; - TestBinaryKernel(Xor, values1, values2, values3, values3_nulls); + CheckImplicitConstructor(Datum::CHUNKED_ARRAY); + CheckImplicitConstructor(Datum::RECORD_BATCH); + CheckImplicitConstructor(Datum::TABLE); } class TestInvokeBinaryKernel : public ComputeFixture, public TestBase {}; @@ -1618,14 +90,14 @@ TEST_F(TestInvokeBinaryKernel, Exceptions) { auto a2 = _MakeArray(type, values2, {}); // Left is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel( - &this->ctx_, &kernel, Datum(table), Datum(a2), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, table, a2, + &outputs)); // Right is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(table), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, table, + &outputs)); // Different sized inputs - ASSERT_RAISES(Invalid, 
detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1),
-                                                         Datum(a1->Slice(1)), &outputs));
+  ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1,
+                                                         a1->Slice(1), &outputs));
 }
 
 }  // namespace compute
diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h
index 8048fff75bc29..93bec75a026ba 100644
--- a/cpp/src/arrow/compute/kernel.h
+++ b/cpp/src/arrow/compute/kernel.h
@@ -19,6 +19,7 @@
 #define ARROW_COMPUTE_KERNEL_H
 
 #include <memory>
+#include <utility>
 #include <vector>
 
 #include "arrow/array.h"
@@ -60,24 +61,41 @@ struct ARROW_EXPORT Datum {
   /// \brief Empty datum, to be populated elsewhere
   Datum() : value(NULLPTR) {}
 
-  explicit Datum(const std::shared_ptr<Scalar>& value) : value(value) {}
-
-  explicit Datum(const std::shared_ptr<ArrayData>& value) : value(value) {}
-
-  explicit Datum(const std::shared_ptr<Array>& value) : Datum(value->data()) {}
-
-  explicit Datum(const std::shared_ptr<ChunkedArray>& value) : value(value) {}
-
-  explicit Datum(const std::shared_ptr<RecordBatch>& value) : value(value) {}
-
-  explicit Datum(const std::shared_ptr<Table>& value) : value(value) {}
-
-  explicit Datum(const std::vector<Datum>& value) : value(value) {}
+  Datum(const std::shared_ptr<Scalar>& value)  // NOLINT implicit conversion
+      : value(value) {}
+  Datum(const std::shared_ptr<ArrayData>& value)  // NOLINT implicit conversion
+      : value(value) {}
+
+  Datum(const std::shared_ptr<Array>& value)  // NOLINT implicit conversion
+      : Datum(value ? value->data() : NULLPTR) {}
+
+  Datum(const std::shared_ptr<ChunkedArray>& value)  // NOLINT implicit conversion
+      : value(value) {}
+  Datum(const std::shared_ptr<RecordBatch>& value)  // NOLINT implicit conversion
+      : value(value) {}
+  Datum(const std::shared_ptr<Table>& value)  // NOLINT implicit conversion
+      : value(value) {}
+  Datum(const std::vector<Datum>& value)  // NOLINT implicit conversion
+      : value(value) {}
+
+  // Cast from subtypes of Array to Datum
+  template <typename T,
+            typename = typename std::enable_if<std::is_base_of<Array, T>::value>::type>
+  Datum(const std::shared_ptr<T>& value)  // NOLINT implicit conversion
+      : Datum(std::shared_ptr<Array>(value)) {}
 
   ~Datum() {}
 
   Datum(const Datum& other) noexcept { this->value = other.value; }
 
+  // Define move constructor and move assignment, for better performance
+  Datum(Datum&& other) noexcept : value(std::move(other.value)) {}
+
+  Datum& operator=(Datum&& other) noexcept {
+    value = std::move(other.value);
+    return *this;
+  }
+
   Datum::type kind() const {
     switch (this->value.which()) {
       case 0:
@@ -133,9 +151,24 @@ struct ARROW_EXPORT Datum {
 };
 
 /// \class UnaryKernel
-/// \brief An array-valued function of a single input argument
+/// \brief A function of a single input argument.
+///
+/// Note to implementors: Try to avoid making kernels that allocate memory if
+/// the output size is a deterministic function of the input Datum's metadata.
+/// Instead, separate the kernel's logic and the necessary allocations into
+/// two different kernels. Some reusable kernels that allocate buffers
+/// and delegate computation to another kernel are available in util-internal.h.
 class ARROW_EXPORT UnaryKernel : public OpKernel {
  public:
+  /// \brief Executes the kernel.
+  ///
+  /// \param[in] ctx The function context for the kernel
+  /// \param[in] input The kernel input data
+  /// \param[out] out The output of the function. Each implementation of this
+  /// function might assume different things about the existing contents of
+  /// out (e.g. which buffers are preallocated). In the future it is expected
+  /// that there will be a more generic mechanism for understanding the
+  /// necessary contracts.
   virtual Status Call(FunctionContext* ctx, const Datum& input, Datum* out) = 0;
 };
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 923c8c3bd4e81..4d508aacb9990 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -15,8 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
-install(FILES
-  boolean.h
-  cast.h
-  hash.h
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute/kernels")
+ARROW_INSTALL_ALL_HEADERS("arrow/compute/kernels")
+
+ADD_ARROW_TEST(boolean-test PREFIX "arrow-compute")
+ADD_ARROW_TEST(cast-test PREFIX "arrow-compute")
+ADD_ARROW_TEST(hash-test PREFIX "arrow-compute")
diff --git a/cpp/src/arrow/compute/kernels/boolean-test.cc b/cpp/src/arrow/compute/kernels/boolean-test.cc
new file mode 100644
index 0000000000000..24b3c68aa1cfb
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/boolean-test.cc
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +#include "arrow/test-common.h" +#include "arrow/test-util.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/boolean.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +using BinaryKernelFunc = + std::function; + +class TestBooleanKernel : public ComputeFixture, public TestBase { + public: + void TestArrayBinary(const BinaryKernelFunc& kernel, const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + Datum result; + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + std::shared_ptr result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(expected)); + } + + void TestChunkedArrayBinary(const BinaryKernelFunc& kernel, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + Datum result; + std::shared_ptr result_array; + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + std::shared_ptr result_ca = result.chunked_array(); + ASSERT_TRUE(result_ca->Equals(expected)); + } + + void TestBinaryKernel(const BinaryKernelFunc& kernel, const std::vector& values1, + const std::vector& values2, + const std::vector& values3, + const std::vector& values3_nulls) { + auto type = boolean(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + auto a3 = _MakeArray(type, values3, {}); + auto a1_nulls = _MakeArray(type, values1, values1); + auto a2_nulls = _MakeArray(type, values2, values2); + auto a3_nulls = _MakeArray(type, values3, values3_nulls); + + TestArrayBinary(kernel, a1, a2, a3); + TestArrayBinary(kernel, a1_nulls, a2_nulls, a3_nulls); + TestArrayBinary(kernel, a1->Slice(1), a2->Slice(1), a3->Slice(1)); + TestArrayBinary(kernel, a1_nulls->Slice(1), a2_nulls->Slice(1), a3_nulls->Slice(1)); + + // ChunkedArray + std::vector> ca1_arrs = {a1, a1->Slice(1)}; + auto ca1 = std::make_shared(ca1_arrs); + std::vector> ca2_arrs = {a2, a2->Slice(1)}; + auto ca2 = std::make_shared(ca2_arrs); + std::vector> ca3_arrs = {a3, a3->Slice(1)}; + auto ca3 = std::make_shared(ca3_arrs); + TestChunkedArrayBinary(kernel, ca1, ca2, ca3); + + // ChunkedArray with different chunks + std::vector> ca4_arrs = {a1->Slice(0, 1), a1->Slice(1), + a1->Slice(1, 1), a1->Slice(2)}; + auto ca4 = std::make_shared(ca4_arrs); + TestChunkedArrayBinary(kernel, ca4, ca2, ca3); + } +}; + +TEST_F(TestBooleanKernel, Invert) { + vector values1 = {true, false, true}; + vector values2 = {false, true, false}; + + auto type = boolean(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + // Plain array + Datum result; + ASSERT_OK(Invert(&this->ctx_, a1, &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + std::shared_ptr result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(a2)); + + // Array with offset + ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(a2->Slice(1))); + + // ChunkedArray + std::vector> ca1_arrs = {a1, a1->Slice(1)}; + auto ca1 = std::make_shared(ca1_arrs); + std::vector> ca2_arrs = 
{a2, a2->Slice(1)}; + auto ca2 = std::make_shared(ca2_arrs); + ASSERT_OK(Invert(&this->ctx_, ca1, &result)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + std::shared_ptr result_ca = result.chunked_array(); + ASSERT_TRUE(result_ca->Equals(ca2)); +} + +TEST_F(TestBooleanKernel, And) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {true, false, false, false, true, false}; + TestBinaryKernel(And, values1, values2, values3, values3); +} + +TEST_F(TestBooleanKernel, Or) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {true, true, true, false, true, true}; + vector values3_nulls = {true, false, false, false, true, false}; + TestBinaryKernel(Or, values1, values2, values3, values3_nulls); +} + +TEST_F(TestBooleanKernel, Xor) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {false, true, true, false, false, true}; + vector values3_nulls = {true, false, false, false, true, false}; + TestBinaryKernel(Xor, values1, values2, values3, values3_nulls); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/boolean.cc b/cpp/src/arrow/compute/kernels/boolean.cc index e1aa1669572d0..91a0e9344305c 100644 --- a/cpp/src/arrow/compute/kernels/boolean.cc +++ b/cpp/src/arrow/compute/kernels/boolean.cc @@ -43,34 +43,35 @@ namespace compute { class InvertKernel : public UnaryKernel { Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { DCHECK_EQ(Datum::ARRAY, input.kind()); + constexpr int64_t kZeroDestOffset = 0; const ArrayData& in_data = *input.array(); - ArrayData* result; - - out->value = ArrayData::Make(boolean(), in_data.length); - result = out->array().get(); + std::shared_ptr result = out->array(); + result->type = boolean(); - // Allocate or copy bitmap + // Handle validity bitmap result->null_count = in_data.null_count; - std::shared_ptr validity_bitmap = in_data.buffers[0]; + const std::shared_ptr& validity_bitmap = in_data.buffers[0]; if (in_data.offset != 0) { - RETURN_NOT_OK(CopyBitmap(ctx->memory_pool(), validity_bitmap->data(), - in_data.offset, in_data.length, &validity_bitmap)); + DCHECK_LE(BitUtil::BytesForBits(in_data.length), validity_bitmap->size()); + CopyBitmap(validity_bitmap->data(), in_data.offset, in_data.length, + result->buffers[0]->mutable_data(), kZeroDestOffset); + } else { + result->buffers[0] = validity_bitmap; } - result->buffers.push_back(validity_bitmap); - - // Allocate output data buffer - std::shared_ptr data_buffer; - RETURN_NOT_OK(InvertBitmap(ctx->memory_pool(), in_data.buffers[1]->data(), - in_data.offset, in_data.length, &data_buffer)); - result->buffers.push_back(data_buffer); + // Handle output data buffer + const std::shared_ptr& data_buffer = in_data.buffers[1]; + DCHECK_LE(BitUtil::BytesForBits(in_data.length), data_buffer->size()); + InvertBitmap(data_buffer->data(), in_data.offset, in_data.length, + result->buffers[1]->mutable_data(), kZeroDestOffset); return Status::OK(); } }; Status Invert(FunctionContext* ctx, const Datum& value, Datum* out) { - InvertKernel kernel; + detail::PrimitiveAllocatingUnaryKernel kernel( + std::unique_ptr(new InvertKernel())); std::vector result; RETURN_NOT_OK(detail::InvokeUnaryArrayKernel(ctx, &kernel, value, &result)); diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc 
b/cpp/src/arrow/compute/kernels/cast-test.cc new file mode 100644 index 0000000000000..c3a0df5d8a73f --- /dev/null +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -0,0 +1,1232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/hash.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +static std::vector> kNumericTypes = { + uint8(), int8(), uint16(), int16(), uint32(), + int32(), uint64(), int64(), float32(), float64()}; + +static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { + ASSERT_EQ(left.data()->buffers[buffer_index].get(), + right.data()->buffers[buffer_index].get()); +} + +class TestCast : public ComputeFixture, public TestBase { + public: + void CheckPass(const Array& input, const Array& expected, + const shared_ptr& out_type, const CastOptions& options) { + shared_ptr result; + ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); + ASSERT_ARRAYS_EQUAL(expected, *result); + } + + template + void CheckFails(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const CastOptions& options) { + shared_ptr input, result; + if (is_valid.size() > 0) { + ArrayFromVector(in_type, is_valid, in_values, &input); + } else { + ArrayFromVector(in_type, in_values, &input); + } + ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); + } + + void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { + shared_ptr result; + ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); + ASSERT_EQ(input.data()->buffers.size(), result->data()->buffers.size()); + for (size_t i = 0; i < input.data()->buffers.size(); ++i) { + AssertBufferSame(input, *result, static_cast(i)); + } + } + + template + void CheckCase(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const vector& out_values, const CastOptions& options) { + DCHECK_EQ(in_values.size(), out_values.size()); + shared_ptr input, expected; + if (is_valid.size() > 0) { + DCHECK_EQ(is_valid.size(), out_values.size()); + ArrayFromVector(in_type, is_valid, 
in_values, &input); + ArrayFromVector(out_type, is_valid, out_values, &expected); + } else { + ArrayFromVector(in_type, in_values, &input); + ArrayFromVector(out_type, out_values, &expected); + } + CheckPass(*input, *expected, out_type, options); + + // Check a sliced variant + if (input->length() > 1) { + CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); + } + } + + void CheckCaseJSON(const shared_ptr& in_type, + const shared_ptr& out_type, const std::string& in_json, + const std::string& expected_json, + const CastOptions& options = CastOptions()) { + shared_ptr input = ArrayFromJSON(in_type, in_json); + shared_ptr expected = ArrayFromJSON(out_type, expected_json); + DCHECK_EQ(input->length(), expected->length()); + CheckPass(*input, *expected, out_type, options); + + // Check a sliced variant + if (input->length() > 1) { + CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); + } + } +}; + +TEST_F(TestCast, SameTypeZeroCopy) { + shared_ptr arr = ArrayFromJSON(int32(), "[0, null, 2, 3, 4]"); + shared_ptr result; + ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); + + AssertBufferSame(*arr, *result, 0); + AssertBufferSame(*arr, *result, 1); +} + +TEST_F(TestCast, FromBoolean) { + CastOptions options; + + vector is_valid(20, true); + is_valid[3] = false; + + vector v1(is_valid.size(), true); + vector e1(is_valid.size(), 1); + for (size_t i = 0; i < v1.size(); ++i) { + if (i % 3 == 1) { + v1[i] = false; + e1[i] = 0; + } + } + + CheckCase(boolean(), v1, is_valid, int32(), e1, + options); +} + +TEST_F(TestCast, ToBoolean) { + CastOptions options; + for (auto type : kNumericTypes) { + CheckCaseJSON(type, boolean(), "[0, null, 127, 1, 0]", + "[false, null, true, true, false]"); + } + + // Check negative numbers + CheckCaseJSON(int8(), boolean(), "[0, null, 127, -1, 0]", + "[false, null, true, true, false]"); + CheckCaseJSON(float64(), boolean(), "[0, null, 127, -1, 0]", + "[false, null, true, true, false]"); +} + +TEST_F(TestCast, ToIntUpcast) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + // int8 to int32 + vector v1 = {0, 1, 127, -1, 0}; + vector e1 = {0, 1, 127, -1, 0}; + CheckCase(int8(), v1, is_valid, int32(), e1, + options); + + // bool to int8 + vector v2 = {false, true, false, true, true}; + vector e2 = {0, 1, 0, 1, 1}; + CheckCase(boolean(), v2, is_valid, int8(), e2, + options); + + // uint8 to int16, no overflow/underrun + vector v3 = {0, 100, 200, 255, 0}; + vector e3 = {0, 100, 200, 255, 0}; + CheckCase(uint8(), v3, is_valid, int16(), e3, + options); +} + +TEST_F(TestCast, OverflowInNullSlot) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + vector v11 = {0, 70000, 2000, 1000, 0}; + vector e11 = {0, 0, 2000, 1000, 0}; + + shared_ptr expected; + ArrayFromVector(int16(), is_valid, e11, &expected); + + auto buf = Buffer::Wrap(v11.data(), v11.size()); + Int32Array tmp11(5, buf, expected->null_bitmap(), -1); + + CheckPass(tmp11, *expected, int16(), options); +} + +TEST_F(TestCast, ToIntDowncastSafe) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + // int16 to uint8, no overflow/underrun + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, uint8(), e1, + options); + + // int16 to uint8, with overflow + vector v2 = {0, 100, 256, 0, 0}; + CheckFails(int16(), v2, is_valid, uint8(), 
options); + + // underflow + vector v3 = {0, 100, -1, 0, 0}; + CheckFails(int16(), v3, is_valid, uint8(), options); + + // int32 to int16, no overflow + vector v4 = {0, 1000, 2000, 1, 2}; + vector e4 = {0, 1000, 2000, 1, 2}; + CheckCase(int32(), v4, is_valid, int16(), e4, + options); + + // int32 to int16, overflow + vector v5 = {0, 1000, 2000, 70000, 0}; + CheckFails(int32(), v5, is_valid, int16(), options); + + // underflow + vector v6 = {0, 1000, 2000, -70000, 0}; + CheckFails(int32(), v6, is_valid, int16(), options); + + vector v7 = {0, 1000, 2000, -70000, 0}; + CheckFails(int32(), v7, is_valid, uint8(), options); +} + +template +std::vector UnsafeVectorCast(const std::vector& v) { + size_t n_elems = v.size(); + std::vector result(n_elems); + + for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); + + return std::move(result); +} + +TEST_F(TestCast, IntegerSignedToUnsigned) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; + + // Same width + CheckFails(int32(), v1, is_valid, uint32(), options); + // Wider + CheckFails(int32(), v1, is_valid, uint64(), options); + // Narrower + CheckFails(int32(), v1, is_valid, uint16(), options); + // Fail because of overflow (instead of underflow). + vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; + CheckFails(int32(), over, is_valid, uint16(), options); + + options.allow_int_overflow = true; + + CheckCase( + int32(), v1, is_valid, uint32(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), v1, is_valid, uint64(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), v1, is_valid, uint16(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), over, is_valid, uint16(), UnsafeVectorCast(over), + options); +} + +TEST_F(TestCast, IntegerUnsignedToSigned) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, true, true}; + + vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; + vector v2 = {0, INT16_MAX + 1, 2}; + // Same width + CheckFails(uint32(), v1, is_valid, int32(), options); + // Narrower + CheckFails(uint32(), v1, is_valid, int16(), options); + CheckFails(uint32(), v2, is_valid, int16(), options); + + options.allow_int_overflow = true; + + CheckCase( + uint32(), v1, is_valid, int32(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v1, is_valid, int64(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v1, is_valid, int16(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v2, is_valid, int16(), UnsafeVectorCast(v2), options); +} + +TEST_F(TestCast, ToIntDowncastUnsafe) { + CastOptions options; + options.allow_int_overflow = true; + + vector is_valid = {true, false, true, true, true}; + + // int16 to uint8, no overflow/underrun + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, uint8(), e1, + options); + + // int16 to uint8, with overflow + vector v2 = {0, 100, 256, 0, 0}; + vector e2 = {0, 100, 0, 0, 0}; + CheckCase(int16(), v2, is_valid, uint8(), e2, + options); + + // underflow + vector v3 = {0, 100, -1, 0, 0}; + vector e3 = {0, 100, 255, 0, 0}; + CheckCase(int16(), v3, is_valid, uint8(), e3, + options); + + // int32 to int16, no overflow + vector v4 = {0, 1000, 2000, 1, 2}; + vector e4 = {0, 1000, 2000, 1, 2}; + CheckCase(int32(), v4, is_valid, int16(), e4, + options); + + // int32 to int16, overflow + // TODO(wesm): do we want to allow this? 
we could set to null + vector v5 = {0, 1000, 2000, 70000, 0}; + vector e5 = {0, 1000, 2000, 4464, 0}; + CheckCase(int32(), v5, is_valid, int16(), e5, + options); + + // underflow + // TODO(wesm): do we want to allow this? we could set overflow to null + vector v6 = {0, 1000, 2000, -70000, 0}; + vector e6 = {0, 1000, 2000, -4464, 0}; + CheckCase(int32(), v6, is_valid, int16(), e6, + options); +} + +TEST_F(TestCast, FloatingPointToInt) { + // which means allow_float_truncate == false + auto options = CastOptions::Safe(); + + vector is_valid = {true, false, true, true, true}; + vector all_valid = {true, true, true, true, true}; + + // float32 to int32 no truncation + vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e1 = {1, 0, 0, -1, 5}; + CheckCase(float32(), v1, is_valid, int32(), e1, + options); + CheckCase(float32(), v1, all_valid, int32(), e1, + options); + + // float64 to int32 no truncation + vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e2 = {1, 0, 0, -1, 5}; + CheckCase(float64(), v2, is_valid, int32(), e2, + options); + CheckCase(float64(), v2, all_valid, int32(), e2, + options); + + // float64 to int64 no truncation + vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e3 = {1, 0, 0, -1, 5}; + CheckCase(float64(), v3, is_valid, int64(), e3, + options); + CheckCase(float64(), v3, all_valid, int64(), e3, + options); + + // float64 to int32 truncate + vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; + vector e4 = {1, 0, 0, -1, 5}; + + options.allow_float_truncate = false; + CheckFails(float64(), v4, is_valid, int32(), options); + CheckFails(float64(), v4, all_valid, int32(), options); + + options.allow_float_truncate = true; + CheckCase(float64(), v4, is_valid, int32(), e4, + options); + CheckCase(float64(), v4, all_valid, int32(), e4, + options); + + // float64 to int64 truncate + vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; + vector e5 = {1, 0, 0, -1, 5}; + + options.allow_float_truncate = false; + CheckFails(float64(), v5, is_valid, int64(), options); + CheckFails(float64(), v5, all_valid, int64(), options); + + options.allow_float_truncate = true; + CheckCase(float64(), v5, is_valid, int64(), e5, + options); + CheckCase(float64(), v5, all_valid, int64(), e5, + options); +} + +TEST_F(TestCast, IntToFloatingPoint) { + auto options = CastOptions::Safe(); + + vector all_valid = {true, true, true, true, true}; + vector all_invalid = {false, false, false, false, false}; + + vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; + CheckFails(int64(), v1, all_valid, float32(), options); + + // While it's not safe to convert, all values are null. 
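// Background for the CheckFails just above: a 32-bit float has a 24-bit
// significand, so it cannot represent every 64-bit integer exactly. A minimal
// standalone sketch of the precision loss the safe cast guards against
// (illustration only, not part of the test fixture):
//
//   int64_t a = INT64_C(1) << 24;  // 16777216
//   int64_t b = a + 1;             // 16777217
//   assert(static_cast<float>(a) == static_cast<float>(b));  // b rounds to a
//
// In the case below every slot is null, so no value actually has to be
// representable and the cast is allowed to succeed.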
+ CheckCase(int64(), v1, all_invalid, float64(), + UnsafeVectorCast(v1), + options); +} + +TEST_F(TestCast, TimestampToTimestamp) { + CastOptions options; + + auto CheckTimestampCast = + [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, + const vector& from_values, const vector& to_values, + const vector& is_valid) { + CheckCase( + timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, + options); + }; + + vector is_valid = {true, false, true, true, true}; + + // Multiply promotions + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); + + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); + + vector v3 = {0, 100, 200, 1, 2}; + vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); + + vector v4 = {0, 100, 200, 1, 2}; + vector e4 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); + + vector v5 = {0, 100, 200, 1, 2}; + vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); + + vector v6 = {0, 100, 200, 1, 2}; + vector e6 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); + + // Zero copy + vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, + &arr); + CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); + + // ARROW-1773, cast to integer + CheckZeroCopy(*arr, int64()); + + // Divide, truncate + vector v8 = {0, 100123, 200456, 1123, 2456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); + + vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; + vector e9 = {0, 100, 200, 1, 2}; + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); + + vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; + vector e10 = {0, 100, 200, 1, 2}; + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); + + // Disallow truncate, failures + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, + timestamp(TimeUnit::SECOND), options); + CheckFails(timestamp(TimeUnit::MICRO), v8, is_valid, + timestamp(TimeUnit::MILLI), options); + CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, + timestamp(TimeUnit::MICRO), options); + CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, + timestamp(TimeUnit::SECOND), options); + CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, + timestamp(TimeUnit::MILLI), options); + CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, + timestamp(TimeUnit::SECOND), options); +} + +TEST_F(TestCast, TimestampToDate32_Date64) { + CastOptions options; + + vector is_valid = {true, true, false}; + + // 2000-01-01, 2000-01-02, null + vector v_nano = 
{946684800000000000, 946771200000000000, 0}; + vector v_micro = {946684800000000, 946771200000000, 0}; + vector v_milli = {946684800000, 946771200000, 0}; + vector v_second = {946684800, 946771200, 0}; + vector v_day = {10957, 10958, 0}; + + // Simple conversions + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); + + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); + + // Disallow truncate, failures + vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; + vector v_micro_fail = {946684800000001, 946771200000001, 0}; + vector v_milli_fail = {946684800001, 946771200001, 0}; + vector v_second_fail = {946684801, 946771201, 0}; + + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date64(), options); + + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date32(), options); + + // Make sure that nulls are excluded from the truncation checks + vector v_second_nofail = {946684800, 946771200, 1}; + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); +} + +TEST_F(TestCast, TimeToCompatible) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // Multiply promotions + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time32(TimeUnit::SECOND), v1, is_valid, time32(TimeUnit::MILLI), e1, options); + + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckCase( + time32(TimeUnit::SECOND), v2, is_valid, time64(TimeUnit::MICRO), e2, options); + + vector v3 = {0, 100, 200, 1, 2}; + vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; + CheckCase( + time32(TimeUnit::SECOND), v3, is_valid, time64(TimeUnit::NANO), e3, options); + + vector v4 = {0, 100, 200, 1, 2}; + vector e4 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time32(TimeUnit::MILLI), v4, is_valid, time64(TimeUnit::MICRO), e4, options); + + vector v5 = {0, 100, 200, 1, 2}; + vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckCase( + time32(TimeUnit::MILLI), v5, is_valid, time64(TimeUnit::NANO), e5, options); + + vector v6 = {0, 100, 200, 1, 2}; + vector e6 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time64(TimeUnit::MICRO), v6, is_valid, 
time64(TimeUnit::NANO), e6, options); + + // Zero copy + vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); + CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int64()); + + vector v7_2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int32()); + + // Divide, truncate + vector v8 = {0, 100123, 200456, 1123, 2456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckCase( + time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), e8, options); + CheckCase( + time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), e8, options); + CheckCase( + time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), e8, options); + + vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; + vector e9 = {0, 100, 200, 1, 2}; + CheckCase( + time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), e9, options); + CheckCase( + time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), e9, options); + + vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; + vector e10 = {0, 100, 200, 1, 2}; + CheckCase( + time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), e10, options); + + // Disallow truncate, failures + + options.allow_time_truncate = false; + CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), + options); + CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), + options); + CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), + options); + CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), + options); + CheckFails(time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), + options); + CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), + options); +} + +TEST_F(TestCast, DateToCompatible) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + constexpr int64_t F = 86400000; + + // Multiply promotion + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; + CheckCase(date32(), v1, is_valid, date64(), + e1, options); + + // Zero copy + vector v2 = {0, 70000, 2000, 1000, 0}; + vector v3 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(date32(), is_valid, v2, &arr); + CheckZeroCopy(*arr, date32()); + + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int32()); + + ArrayFromVector(date64(), is_valid, v3, &arr); + CheckZeroCopy(*arr, date64()); + + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int64()); + + // Divide, truncate + vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckCase(date64(), v8, is_valid, date32(), + e8, options); + + // Disallow truncate, failures + options.allow_time_truncate = false; + CheckFails(date64(), v8, is_valid, date32(), options); +} + +TEST_F(TestCast, ToDouble) { + CastOptions options; + vector is_valid = {true, false, true, true, true}; + + // int16 to double + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, float64(), e1, + options); + + // float to double + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100, 200, 1, 2}; + CheckCase(float32(), 
v2, is_valid, float64(), e2, + options); + + // bool to double + vector v3 = {true, true, false, false, true}; + vector e3 = {1, 1, 0, 0, 1}; + CheckCase(boolean(), v3, is_valid, float64(), e3, + options); +} + +TEST_F(TestCast, ChunkedArray) { + vector values1 = {0, 1, 2}; + vector values2 = {3, 4, 5}; + + auto type = int16(); + auto out_type = int64(); + + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + CastOptions options; + + Datum out; + ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); + + auto out_carr = out.chunked_array(); + + vector ex_values1 = {0, 1, 2}; + vector ex_values2 = {3, 4, 5}; + auto a3 = _MakeArray(out_type, ex_values1, {}); + auto a4 = _MakeArray(out_type, ex_values2, {}); + + ArrayVector ex_arrays = {a3, a4}; + auto ex_carr = std::make_shared(ex_arrays); + + ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); +} + +TEST_F(TestCast, UnsupportedTarget) { + vector is_valid = {true, false, true, true, true}; + vector v1 = {0, 1, 2, 3, 4}; + + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + shared_ptr result; + ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); +} + +TEST_F(TestCast, DateTimeZeroCopy) { + vector is_valid = {true, false, true, true, true}; + + vector v1 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + CheckZeroCopy(*arr, date32()); + + vector v2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(int64(), is_valid, v2, &arr); + + CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + CheckZeroCopy(*arr, date64()); + CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); +} + +TEST_F(TestCast, FromNull) { + // Null casts to everything + const int length = 10; + + NullArray arr(length); + + shared_ptr result; + ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); + + ASSERT_EQ(length, result->length()); + ASSERT_EQ(length, result->null_count()); + + // OK to look at bitmaps + ASSERT_ARRAYS_EQUAL(*result, *result); +} + +TEST_F(TestCast, PreallocatedMemory) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + const int64_t length = 5; + + shared_ptr arr; + vector v1 = {0, 70000, 2000, 1000, 0}; + vector e1 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(int32(), is_valid, v1, &arr); + + auto out_type = int64(); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); + + auto out_data = ArrayData::Make(out_type, length); + + shared_ptr out_values; + ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); + + out_data->buffers.push_back(nullptr); + out_data->buffers.push_back(out_values); + + Datum out(out_data); + ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); + + // Buffer address unchanged + ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); + + shared_ptr result = MakeArray(out_data); + shared_ptr expected; + ArrayFromVector(int64(), is_valid, e1, &expected); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, + const vector& in_values, + const std::shared_ptr& out_type, + const vector& out_values) { + using OutTraits = TypeTraits; + + CastOptions options; + + const int64_t length = static_cast(in_values.size()); + + shared_ptr arr, expected; + 
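// What this helper checks (ARROW-1735): a single preallocated output buffer
// is filled by two kernel invocations, the first writing the front half via
// arr->Slice(0, length / 2) and the second writing the back half through an
// ArrayData copy whose offset field points at length / 2; the result is then
// compared against the expected array as a whole.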
ArrayFromVector(in_type, in_values, &arr); + ArrayFromVector(out_type, out_values, &expected); + + shared_ptr out_buffer; + ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); + + const int64_t first_half = length / 2; + + auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); + auto out_second_data = out_data->Copy(); + out_second_data->offset = first_half; + + Datum out_first(out_data); + Datum out_second(out_second_data); + + // Cast each bit + ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); + + shared_ptr result = MakeArray(out_data); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +TEST_F(TestCast, OffsetOutputBuffer) { + // ARROW-1735 + vector v1 = {0, 10000, 2000, 1000, 0}; + vector e1 = {0, 10000, 2000, 1000, 0}; + + auto in_type = int32(); + auto out_type = int64(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + out_type, e1); + + vector e2 = {false, true, true, true, false}; + + out_type = boolean(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + boolean(), e2); + + vector e3 = {0, 10000, 2000, 1000, 0}; + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + int16(), e3); +} + +TEST_F(TestCast, StringToBoolean) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + vector v1 = {"False", "true", "true", "True", "false"}; + vector v2 = {"0", "1", "1", "1", "0"}; + vector e = {false, true, true, true, false}; + CheckCase(utf8(), v1, is_valid, boolean(), + e, options); + CheckCase(utf8(), v2, is_valid, boolean(), + e, options); +} + +TEST_F(TestCast, StringToBooleanErrors) { + CastOptions options; + + vector is_valid = {true}; + + CheckFails(utf8(), {"false "}, is_valid, boolean(), options); + CheckFails(utf8(), {"T"}, is_valid, boolean(), options); +} + +TEST_F(TestCast, StringToNumber) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // string to int + vector v_int = {"0", "1", "127", "-1", "0"}; + vector e_int8 = {0, 1, 127, -1, 0}; + vector e_int16 = {0, 1, 127, -1, 0}; + vector e_int32 = {0, 1, 127, -1, 0}; + vector e_int64 = {0, 1, 127, -1, 0}; + CheckCase(utf8(), v_int, is_valid, int8(), + e_int8, options); + CheckCase(utf8(), v_int, is_valid, int16(), + e_int16, options); + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + CheckCase(utf8(), v_int, is_valid, int64(), + e_int64, options); + + v_int = {"2147483647", "0", "-2147483648", "0", "0"}; + e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; + e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; + CheckCase(utf8(), v_int, is_valid, int64(), + e_int64, options); + + // string to uint + vector v_uint = {"0", "1", "127", "255", "0"}; + vector e_uint8 = {0, 1, 127, 255, 0}; + vector e_uint16 = {0, 1, 127, 255, 0}; + vector e_uint32 = {0, 1, 127, 255, 0}; + vector e_uint64 = {0, 1, 127, 255, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint8(), e_uint8, options); + CheckCase(utf8(), v_uint, is_valid, + uint16(), e_uint16, options); + CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + v_uint = {"4294967295", "0", "0", "0", "0"}; + e_uint32 = {4294967295, 0, 0, 0, 0}; + 
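// Boundary values: "4294967295" is UINT32_MAX and "18446744073709551615" is
// UINT64_MAX, so the two cases below exercise the very top of each unsigned
// range; presumably one past the maximum would be rejected, mirroring the
// "256"-into-uint8 failure in StringToNumberErrors further down.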
CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + v_uint = {"18446744073709551615", "0", "0", "0", "0"}; + e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + // string to float + vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; + vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; + vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); + + // Test that casting is locale-independent + auto global_locale = std::locale(); + try { + // French locale uses the comma as decimal point + std::locale::global(std::locale("fr_FR.UTF-8")); + } catch (std::runtime_error&) { + // Locale unavailable, ignore + } + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); + std::locale::global(global_locale); +} + +TEST_F(TestCast, StringToNumberErrors) { + CastOptions options; + + vector is_valid = {true}; + + CheckFails(utf8(), {"z"}, is_valid, int8(), options); + CheckFails(utf8(), {"12 z"}, is_valid, int8(), options); + CheckFails(utf8(), {"128"}, is_valid, int8(), options); + CheckFails(utf8(), {"-129"}, is_valid, int8(), options); + CheckFails(utf8(), {"0.5"}, is_valid, int8(), options); + + CheckFails(utf8(), {"256"}, is_valid, uint8(), options); + CheckFails(utf8(), {"-1"}, is_valid, uint8(), options); + + CheckFails(utf8(), {"z"}, is_valid, float32(), options); +} + +TEST_F(TestCast, StringToTimestamp) { + CastOptions options; + + vector is_valid = {true, false, true}; + vector strings = {"1970-01-01", "xxx", "2000-02-29"}; + + auto type = timestamp(TimeUnit::SECOND); + vector e = {0, 0, 951782400}; + CheckCase(utf8(), strings, is_valid, + type, e, options); + + type = timestamp(TimeUnit::MICRO); + e = {0, 0, 951782400000000LL}; + CheckCase(utf8(), strings, is_valid, + type, e, options); + + // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc +} + +TEST_F(TestCast, StringToTimestampErrors) { + CastOptions options; + + vector is_valid = {true}; + + for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { + auto type = timestamp(unit); + CheckFails(utf8(), {""}, is_valid, type, options); + CheckFails(utf8(), {"xxx"}, is_valid, type, options); + } +} + +constexpr const char* kInvalidUtf8 = "\xa0\xa1"; + +TEST_F(TestCast, BinaryToString) { + CastOptions options; + + // All valid except the last one + vector all = {1, 1, 1, 1, 1}; + vector valid = {1, 1, 1, 1, 0}; + vector strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; + + std::shared_ptr array; + + // Should accept when invalid but null. 
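// kInvalidUtf8 above ("\xa0\xa1") is a bare pair of continuation bytes, which
// no well-formed UTF-8 sequence can begin with. The validation loop in
// cast.cc only inspects non-null slots, so marking that slot null (via the
// `valid` vector) lets the cast succeed, and it stays zero-copy: the string
// array shares the binary array's offsets and data buffers.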
+ ArrayFromVector(binary(), valid, strings, &array); + CheckZeroCopy(*array, utf8()); + + // Should refuse due to invalid utf8 payload + CheckFails(binary(), strings, all, utf8(), options); + + // Should accept due to option override + options.allow_invalid_utf8 = true; + CheckCase(binary(), strings, all, + utf8(), strings, options); +} + +template +class TestDictionaryCast : public TestCast {}; + +typedef ::testing::Types + TestTypes; + +TYPED_TEST_CASE(TestDictionaryCast, TestTypes); + +TYPED_TEST(TestDictionaryCast, Basic) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + Datum out; + ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); + + this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); +} + +TEST_F(TestCast, DictToNonDictNoNulls) { + vector dict_values = {"foo", "bar", "baz"}; + auto ex_dict = _MakeArray(utf8(), dict_values, {}); + auto dict_type = dictionary(int32(), ex_dict); + + // Explicitly construct with nullptr for the null_bitmap_data + std::vector i1 = {1, 0, 1}; + std::vector i2 = {2, 1, 0, 1}; + auto c1 = std::make_shared>(3, Buffer::Wrap(i1)); + auto c2 = std::make_shared>(4, Buffer::Wrap(i2)); + + ArrayVector dict_arrays = {std::make_shared(dict_type, c1), + std::make_shared(dict_type, c2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum cast_input(dict_carr); + Datum cast_output; + // Ensure that casting works even when the null_bitmap_data array is a nullptr + ASSERT_OK(Cast(&this->ctx_, cast_input, + static_cast(*dict_type).dictionary()->type(), + CastOptions(), &cast_output)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, cast_output.kind()); + + auto e1 = _MakeArray(utf8(), {"bar", "foo", "bar"}, {}); + auto e2 = _MakeArray(utf8(), {"baz", "bar", "foo", "bar"}, {}); + + auto chunks = cast_output.chunked_array()->chunks(); + ASSERT_EQ(chunks.size(), 2); + ASSERT_ARRAYS_EQUAL(*e1, *chunks[0]); + ASSERT_ARRAYS_EQUAL(*e2, *chunks[1]); +} + +/*TYPED_TEST(TestDictionaryCast, Reverse) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + shared_ptr dict_array; + ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); + + this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); +}*/ + +TEST_F(TestCast, ListToList) { + CastOptions options; + std::shared_ptr offsets; + + vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; + std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; + ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + + shared_ptr int32_plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + std::shared_ptr int32_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); + + std::shared_ptr int64_plain_array; + ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); + std::shared_ptr int64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); + + std::shared_ptr float64_plain_array; + ASSERT_OK( + Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); + std::shared_ptr float64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); + + CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options); + CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); + 
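// A list-to-list cast only rewrites the child values array; the offsets (with
// their null slot at index 4) carry over unchanged, which is why the same
// `offsets` array could be used to build all three list arrays above. The
// widening directions pass with default options; the float64 -> integer
// directions below additionally need allow_float_truncate = true.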
CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options); + CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options); + + options.allow_float_truncate = true; + CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); + CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); +} + +TEST_F(TestCast, IdentityCasts) { + // ARROW-4102 + auto CheckIdentityCast = [this](std::shared_ptr type, + const std::string& json) { + auto arr = ArrayFromJSON(type, json); + CheckZeroCopy(*arr, type); + }; + + CheckIdentityCast(null(), "[null, null, null]"); + CheckIdentityCast(boolean(), "[false, true, null, false]"); + + for (auto type : kNumericTypes) { + CheckIdentityCast(type, "[1, 2, null, 4]"); + } + CheckIdentityCast(binary(), "[\"foo\", \"bar\"]"); + CheckIdentityCast(utf8(), "[\"foo\", \"bar\"]"); + CheckIdentityCast(fixed_size_binary(3), "[\"foo\", \"bar\"]"); + + CheckIdentityCast(list(int8()), "[[1, 2], [null], [], [3]]"); + + CheckIdentityCast(time32(TimeUnit::MILLI), "[1, 2, 3, 4]"); + CheckIdentityCast(time64(TimeUnit::MICRO), "[1, 2, 3, 4]"); + CheckIdentityCast(date32(), "[1, 2, 3, 4]"); + CheckIdentityCast(date64(), "[86400000, 0]"); + CheckIdentityCast(timestamp(TimeUnit::SECOND), "[1, 2, 3, 4]"); + + { + auto dict_type = dictionary(int8(), ArrayFromJSON(int8(), "[1, 2, 3]")); + auto dict_indices = ArrayFromJSON(int8(), "[0, 1, 2, 0, null, 2]"); + auto dict_array = std::make_shared(dict_type, dict_indices); + CheckZeroCopy(*dict_array, dict_type); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 4f7d7f822b3ab..74ee7d6d110f4 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -37,6 +37,7 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parsing.h" // IWYU pragma: keep +#include "arrow/util/utf8.h" #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" @@ -77,6 +78,19 @@ namespace compute { constexpr int64_t kMillisecondsInDay = 86400000; +template +struct is_binary_to_string { + static constexpr bool value = false; +}; + +template +struct is_binary_to_string< + O, I, + typename std::enable_if::value && + std::is_base_of::value>::type> { + static constexpr bool value = true; +}; + // ---------------------------------------------------------------------- // Zero copy casts @@ -85,6 +99,8 @@ struct is_zero_copy_cast { static constexpr bool value = false; }; +// TODO(wesm): ARROW-4110; this is no longer needed, but may be useful if we +// ever _do_ want to generate identity cast kernels at compile time template struct is_zero_copy_cast< O, I, @@ -112,15 +128,30 @@ struct is_zero_copy_cast< static constexpr bool value = sizeof(O_T) == sizeof(I_T); }; +// Binary to String doesn't require copying, the payload only needs to be +// validated. +template +struct is_zero_copy_cast< + O, I, + typename std::enable_if::value && + is_binary_to_string::value>::type> { + static constexpr bool value = true; +}; + template struct CastFunctor {}; // Indicated no computation required +// +// The case BinaryType -> StringType is special cased due to validation +// requirements. 
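// Concretely: the generic zero-copy functor below is disabled for
// BinaryType -> StringType through !is_binary_to_string<O, I>::value, while a
// dedicated CastFunctor further down validates the UTF-8 payload (unless
// allow_invalid_utf8 is set) before sharing the buffers via ZeroCopyData.
// Rough shape of the resulting dispatch:
//
//   CastFunctor<Int32Type, Date32Type>   -> plain buffer sharing
//   CastFunctor<StringType, BinaryType>  -> validate UTF-8, then share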
template -struct CastFunctor::value>::type> { +struct CastFunctor::value && + !is_binary_to_string::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - CopyData(input, output); + ZeroCopyData(input, output); } }; @@ -373,6 +404,7 @@ struct is_float_truncate< template struct CastFunctor::value>::type> { + ARROW_DISABLE_UBSAN("float-cast-overflow") void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { using in_type = typename I::c_type; @@ -479,11 +511,9 @@ void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool is_m out_data[i] = static_cast(in_data[i] / factor); } } else { -#define RAISE_INVALID_CAST(VAL) \ - std::stringstream ss; \ - ss << "Casting from " << input.type->ToString() << " to " << output->type->ToString() \ - << " would lose data: " << VAL; \ - ctx->SetStatus(Status::Invalid(ss.str())); +#define RAISE_INVALID_CAST(VAL) \ + ctx->SetStatus(Status::Invalid("Casting from ", input.type->ToString(), " to ", \ + output->type->ToString(), " would lose data: ", VAL)); if (input.null_count != 0) { internal::BitmapReader bit_reader(input.buffers[0]->data(), input.offset, @@ -532,7 +562,7 @@ struct CastFunctor { const auto& out_type = checked_cast(*output->type); if (in_type.unit() == out_type.unit()) { - CopyData(input, output); + ZeroCopyData(input, output); return; } @@ -625,7 +655,7 @@ struct CastFunctor(*output->type); if (in_type.unit() == out_type.unit()) { - CopyData(input, output); + ZeroCopyData(input, output); return; } @@ -766,9 +796,8 @@ struct CastFunctor< UnpackFixedSizeBinaryDictionary(ctx, indices, dictionary, output); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -845,9 +874,8 @@ struct CastFunctor(ctx, indices, dictionary, output))); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -903,9 +931,8 @@ struct CastFunctor(indices, dictionary, out); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -931,9 +958,8 @@ struct CastFunctor> { auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { - std::stringstream ss; - ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ", + output->type->ToString())); return; } } @@ -962,10 +988,9 @@ struct CastFunctortype->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", + input_array.GetString(i), "' into ", + output->type->ToString())); return; } @@ -998,17 +1023,62 @@ struct CastFunctor { continue; } - auto str = input_array.GetView(i); + const auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { - std::stringstream ss; - ss << "Failed to cast String '" << str << "' into " << 
output->type->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ", + output->type->ToString())); return; } } } }; +// ---------------------------------------------------------------------- +// Binary to String +// + +template +struct CastFunctor< + StringType, I, + typename std::enable_if::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + BinaryArray binary(input.Copy()); + + if (options.allow_invalid_utf8) { + ZeroCopyData(input, output); + return; + } + + util::InitializeUTF8(); + + if (binary.null_count() != 0) { + for (int64_t i = 0; i < input.length; i++) { + if (binary.IsNull(i)) { + continue; + } + + const auto str = binary.GetView(i); + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); + return; + } + } + + } else { + for (int64_t i = 0; i < input.length; i++) { + const auto str = binary.GetView(i); + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); + return; + } + } + } + + ZeroCopyData(input, output); + } +}; + // ---------------------------------------------------------------------- typedef std::functiontype->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot pre-allocate memory for type: ", + out->type->ToString()); } if (type_id != Type::NA) { @@ -1077,6 +1146,17 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i return Status::OK(); } +class IdentityCast : public UnaryKernel { + public: + IdentityCast() {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(input.kind(), Datum::ARRAY); + out->value = input.array()->Copy(); + return Status::OK(); + } +}; + class CastKernel : public UnaryKernel { public: CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy, @@ -1088,17 +1168,22 @@ class CastKernel : public UnaryKernel { out_type_(out_type) {} Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { - DCHECK_EQ(Datum::ARRAY, input.kind()); + if (input.kind() != Datum::ARRAY) + return Status::NotImplemented("CastKernel only supports Datum::ARRAY input"); const ArrayData& in_data = *input.array(); - ArrayData* result; - if (out->kind() == Datum::NONE) { - out->value = ArrayData::Make(out_type_, in_data.length); + switch (out->kind()) { + case Datum::NONE: + out->value = ArrayData::Make(out_type_, in_data.length); + break; + case Datum::ARRAY: + break; + default: + return Status::NotImplemented("CastKernel only supports Datum::ARRAY output"); } - result = out->array().get(); - + ArrayData* result = out->array().get(); if (!is_zero_copy_) { RETURN_NOT_OK( AllocateIfNotPreallocated(ctx, in_data, can_pre_allocate_values_, result)); @@ -1117,6 +1202,8 @@ class CastKernel : public UnaryKernel { std::shared_ptr out_type_; }; +// TODO(wesm): ARROW-4110 Do not generate cases that could return IdentityCast + #define CAST_CASE(InType, OutType) \ case OutType::type_id: \ is_zero_copy = is_zero_copy_cast::value; \ @@ -1162,12 +1249,10 @@ class CastKernel : public UnaryKernel { FN(Int64Type, Date64Type); #define DATE32_CASES(FN, IN_TYPE) \ - FN(Date32Type, Date32Type); \ FN(Date32Type, Date64Type); \ FN(Date32Type, Int32Type); #define DATE64_CASES(FN, IN_TYPE) \ - FN(Date64Type, Date64Type); \ FN(Date64Type, Date32Type); 
\ FN(Date64Type, Int64Type); @@ -1187,8 +1272,9 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); +#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); + #define STRING_CASES(FN, IN_TYPE) \ - FN(StringType, StringType); \ FN(StringType, BooleanType); \ FN(StringType, UInt8Type); \ FN(StringType, Int8Type); \ @@ -1242,25 +1328,26 @@ class CastKernel : public UnaryKernel { return nullptr; \ } -GET_CAST_FUNCTION(NULL_CASES, NullType); -GET_CAST_FUNCTION(NUMERIC_CASES, BooleanType); -GET_CAST_FUNCTION(NUMERIC_CASES, UInt8Type); -GET_CAST_FUNCTION(NUMERIC_CASES, Int8Type); -GET_CAST_FUNCTION(NUMERIC_CASES, UInt16Type); -GET_CAST_FUNCTION(NUMERIC_CASES, Int16Type); -GET_CAST_FUNCTION(NUMERIC_CASES, UInt32Type); -GET_CAST_FUNCTION(INT32_CASES, Int32Type); -GET_CAST_FUNCTION(NUMERIC_CASES, UInt64Type); -GET_CAST_FUNCTION(INT64_CASES, Int64Type); -GET_CAST_FUNCTION(NUMERIC_CASES, FloatType); -GET_CAST_FUNCTION(NUMERIC_CASES, DoubleType); -GET_CAST_FUNCTION(DATE32_CASES, Date32Type); -GET_CAST_FUNCTION(DATE64_CASES, Date64Type); -GET_CAST_FUNCTION(TIME32_CASES, Time32Type); -GET_CAST_FUNCTION(TIME64_CASES, Time64Type); -GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); -GET_CAST_FUNCTION(STRING_CASES, StringType); -GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); +GET_CAST_FUNCTION(NULL_CASES, NullType) +GET_CAST_FUNCTION(NUMERIC_CASES, BooleanType) +GET_CAST_FUNCTION(NUMERIC_CASES, UInt8Type) +GET_CAST_FUNCTION(NUMERIC_CASES, Int8Type) +GET_CAST_FUNCTION(NUMERIC_CASES, UInt16Type) +GET_CAST_FUNCTION(NUMERIC_CASES, Int16Type) +GET_CAST_FUNCTION(NUMERIC_CASES, UInt32Type) +GET_CAST_FUNCTION(INT32_CASES, Int32Type) +GET_CAST_FUNCTION(NUMERIC_CASES, UInt64Type) +GET_CAST_FUNCTION(INT64_CASES, Int64Type) +GET_CAST_FUNCTION(NUMERIC_CASES, FloatType) +GET_CAST_FUNCTION(NUMERIC_CASES, DoubleType) +GET_CAST_FUNCTION(DATE32_CASES, Date32Type) +GET_CAST_FUNCTION(DATE64_CASES, Date64Type) +GET_CAST_FUNCTION(TIME32_CASES, Time32Type) +GET_CAST_FUNCTION(TIME64_CASES, Time64Type) +GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType) +GET_CAST_FUNCTION(BINARY_CASES, BinaryType) +GET_CAST_FUNCTION(STRING_CASES, StringType) +GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType) #define CAST_FUNCTION_CASE(InType) \ case InType::type_id: \ @@ -1289,6 +1376,11 @@ Status GetListCastFunc(const DataType& in_type, const std::shared_ptr& Status GetCastFunction(const DataType& in_type, const std::shared_ptr& out_type, const CastOptions& options, std::unique_ptr* kernel) { + if (in_type.Equals(out_type)) { + *kernel = std::unique_ptr(new IdentityCast); + return Status::OK(); + } + switch (in_type.id()) { CAST_FUNCTION_CASE(NullType); CAST_FUNCTION_CASE(BooleanType); @@ -1307,6 +1399,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& CAST_FUNCTION_CASE(Time32Type); CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); + CAST_FUNCTION_CASE(BinaryType); CAST_FUNCTION_CASE(StringType); CAST_FUNCTION_CASE(DictionaryType); case Type::LIST: @@ -1316,10 +1409,8 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& break; } if (*kernel == nullptr) { - std::stringstream ss; - ss << "No cast implemented from " << in_type.ToString() << " to " - << out_type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("No cast implemented from ", in_type.ToString(), " to ", + out_type->ToString()); } return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/cast.h 
b/cpp/src/arrow/compute/kernels/cast.h
index 65c70bf14aa88..8c42f07bda7f1 100644
--- a/cpp/src/arrow/compute/kernels/cast.h
+++ b/cpp/src/arrow/compute/kernels/cast.h
@@ -38,12 +38,14 @@ struct ARROW_EXPORT CastOptions {
   CastOptions()
       : allow_int_overflow(false),
         allow_time_truncate(false),
-        allow_float_truncate(false) {}
+        allow_float_truncate(false),
+        allow_invalid_utf8(false) {}
 
   explicit CastOptions(bool safe)
       : allow_int_overflow(!safe),
         allow_time_truncate(!safe),
-        allow_float_truncate(!safe) {}
+        allow_float_truncate(!safe),
+        allow_invalid_utf8(!safe) {}
 
   static CastOptions Safe() { return CastOptions(true); }
 
@@ -52,6 +54,9 @@ struct ARROW_EXPORT CastOptions {
   bool allow_int_overflow;
   bool allow_time_truncate;
   bool allow_float_truncate;
+  // Indicate if conversions from Binary/FixedSizeBinary to string must
+  // validate the utf8 payload.
+  bool allow_invalid_utf8;
 };
 
 /// \since 0.7.0
diff --git a/cpp/src/arrow/compute/kernels/hash-test.cc b/cpp/src/arrow/compute/kernels/hash-test.cc
new file mode 100644
index 0000000000000..f20575f621b4c
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/hash-test.cc
@@ -0,0 +1,344 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
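// The tests in this file cover the two hash kernels, Unique and
// DictionaryEncode. As a rough usage sketch mirroring the call shapes used
// below (construction of `input` is assumed):
//
//   FunctionContext ctx(default_memory_pool());
//   std::shared_ptr<Array> uniques;
//   ASSERT_OK(Unique(&ctx, input, &uniques));  // distinct non-null values
//   Datum encoded;
//   ASSERT_OK(DictionaryEncode(&ctx, input, &encoded));
//   // encoded wraps a DictionaryArray: int32 indices into the uniques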
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/hash.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +// ---------------------------------------------------------------------- +// Dictionary tests + +template +void CheckUnique(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr expected = _MakeArray(type, out_values, out_is_valid); + + shared_ptr result; + ASSERT_OK(Unique(ctx, input, &result)); + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid, + const vector& out_indices) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); + shared_ptr ex_indices = + _MakeArray(int32(), out_indices, in_is_valid); + + DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); + + Datum datum_out; + ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); + shared_ptr result = MakeArray(datum_out.array()); + + ASSERT_ARRAYS_EQUAL(expected, *result); +} + +class TestHashKernel : public ComputeFixture, public TestBase {}; + +template +class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; + +typedef ::testing::Types + PrimitiveDictionaries; + +TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); + +TYPED_TEST(TestHashKernelPrimitive, Unique) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, + {2, 1}, {}); + CheckUnique(&this->ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, + {3, 1}, {}); +} + +TYPED_TEST(TestHashKernelPrimitive, DictEncode) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, + {true, false, true, true, true, true}, {2, 1, 3}, {}, + {0, 0, 0, 1, 0, 2}); +} + +TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { + using T = typename TypeParam::c_type; + // Skip this test for (u)int8 + if (sizeof(Scalar) == 1) { + return; + } + + const int64_t kTotalValues = 1000000; + const int64_t kRepeats = 5; + + vector values; + vector uniques; + vector indices; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + const auto val = static_cast(i % kTotalValues); + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + } + indices.push_back(static_cast(i % kTotalValues)); + } + + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, values, {}, uniques, {}); + + CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueTimeTimestamp) { + 
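// Helper contract used by the tests below: CheckUnique builds its input from
// (in_values, in_is_valid), runs Unique, and compares against an array built
// from (out_values, out_is_valid); an empty is_valid vector means "all
// valid", and input slots that are null do not contribute to the expected
// uniques.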
CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), + {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, + {}); +} + +TEST_F(TestHashKernel, UniqueBoolean) { + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, + {true, false, true, true}, {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, + {true, false, true, true}, {false, true}, {}); + + // No nulls + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, {}, + {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, {}, + {false, true}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBoolean) { + CheckDictEncode( + &this->ctx_, boolean(), {true, true, false, true, false}, + {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); + + CheckDictEncode( + &this->ctx_, boolean(), {false, true, false, true, false}, + {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); + + // No nulls + CheckDictEncode(&this->ctx_, boolean(), + {true, true, false, true, false}, {}, {true, false}, + {}, {0, 0, 1, 0, 1}); + + CheckDictEncode(&this->ctx_, boolean(), + {false, true, false, true, false}, {}, {false, true}, + {}, {0, 1, 0, 1, 0}); +} + +TEST_F(TestHashKernel, UniqueBinary) { + CheckUnique(&this->ctx_, binary(), + {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); + + CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBinary) { + CheckDictEncode( + &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); + + CheckDictEncode( + &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, BinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + vector values; + vector uniques; + vector indices; + char buf[20] = "test"; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); + values.emplace_back(buf); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, + indices); + + CheckUnique(&this->ctx_, utf8(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, + indices); +} + +TEST_F(TestHashKernel, UniqueFixedSizeBinary) { + CheckUnique( + &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, + {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { + CheckDictEncode( + &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, + {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); +} + +TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t 
kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + vector values; + vector uniques; + vector indices; + char buf[7] = "test.."; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + buf[4] = static_cast(index / 128); + buf[5] = static_cast(index % 128); + values.emplace_back(buf, 6); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + auto type = fixed_size_binary(6); + CheckUnique(&this->ctx_, type, values, {}, uniques, + {}); + CheckDictEncode(&this->ctx_, type, values, {}, + uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueDecimal) { + vector values{12, 12, 11, 12}; + vector expected{12, 11}; + + CheckUnique(&this->ctx_, decimal(2, 0), values, + {true, false, true, true}, expected, {}); +} + +TEST_F(TestHashKernel, DictEncodeDecimal) { + vector values{12, 12, 11, 12, 13}; + vector expected{12, 11, 13}; + + CheckDictEncode(&this->ctx_, decimal(2, 0), values, + {true, false, true, true, true}, expected, + {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, ChunkedArrayInvoke) { + vector values1 = {"foo", "bar", "foo"}; + vector values2 = {"bar", "baz", "quuux", "foo"}; + + auto type = utf8(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + vector dict_values = {"foo", "bar", "baz", "quuux"}; + auto ex_dict = _MakeArray(type, dict_values, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + // Unique + shared_ptr result; + ASSERT_OK(Unique(&this->ctx_, carr, &result)); + ASSERT_ARRAYS_EQUAL(*ex_dict, *result); + + // Dictionary encode + auto dict_type = dictionary(int32(), ex_dict); + + auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); + auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); + + ArrayVector dict_arrays = {std::make_shared(dict_type, i1), + std::make_shared(dict_type, i2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum encoded_out; + ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); + + AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc index c057ea5736139..0513fe1f6ad4f 100644 --- a/cpp/src/arrow/compute/kernels/hash.cc +++ b/cpp/src/arrow/compute/kernels/hash.cc @@ -56,11 +56,9 @@ namespace compute { namespace { -#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ - if (!KERNEL) { \ - std::stringstream ss; \ - ss << FUNCNAME << " not implemented for " << type->ToString(); \ - return Status::NotImplemented(ss.str()); \ +#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ + if (!KERNEL) { \ + return Status::NotImplemented(FUNCNAME, " not implemented for ", type->ToString()); \ } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/util-internal.cc b/cpp/src/arrow/compute/kernels/util-internal.cc index 81fafcba6167a..04ee9c02f4957 100644 --- a/cpp/src/arrow/compute/kernels/util-internal.cc +++ b/cpp/src/arrow/compute/kernels/util-internal.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "arrow/array.h" @@ -27,6 +28,7 @@ #include "arrow/table.h" #include "arrow/util/logging.h" +#include "arrow/compute/context.h" #include "arrow/compute/kernel.h" namespace arrow { @@ -162,6 +164,47 @@ Datum WrapDatumsLike(const Datum& value, const std::vector& 
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index c057ea5736139..0513fe1f6ad4f 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -56,11 +56,9 @@ namespace compute {
 
 namespace {
 
-#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE)                  \
-  if (!KERNEL) {                                                   \
-    std::stringstream ss;                                          \
-    ss << FUNCNAME << " not implemented for " << type->ToString(); \
-    return Status::NotImplemented(ss.str());                       \
+#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE)                                       \
+  if (!KERNEL) {                                                                        \
+    return Status::NotImplemented(FUNCNAME, " not implemented for ", type->ToString()); \
   }
 
 // ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/kernels/util-internal.cc b/cpp/src/arrow/compute/kernels/util-internal.cc
index 81fafcba6167a..04ee9c02f4957 100644
--- a/cpp/src/arrow/compute/kernels/util-internal.cc
+++ b/cpp/src/arrow/compute/kernels/util-internal.cc
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "arrow/array.h"
@@ -27,6 +28,7 @@
 #include "arrow/table.h"
 #include "arrow/util/logging.h"
 
+#include "arrow/compute/context.h"
 #include "arrow/compute/kernel.h"
 
 namespace arrow {
@@ -162,6 +164,47 @@ Datum WrapDatumsLike(const Datum& value, const std::vector<Datum>& datums) {
   }
 }
 
+PrimitiveAllocatingUnaryKernel::PrimitiveAllocatingUnaryKernel(
+    std::unique_ptr<UnaryKernel> delegate)
+    : delegate_(std::move(delegate)) {}
+
+inline void ZeroLastByte(Buffer* buffer) {
+  *(buffer->mutable_data() + (buffer->size() - 1)) = 0;
+}
+
+Status PrimitiveAllocatingUnaryKernel::Call(FunctionContext* ctx, const Datum& input,
+                                            Datum* out) {
+  std::vector<std::shared_ptr<Buffer>> data_buffers;
+  const ArrayData& in_data = *input.array();
+  MemoryPool* pool = ctx->memory_pool();
+
+  // Handle the validity buffer.
+  if (in_data.offset == 0) {
+    // Validity bitmap will be zero copied
+    data_buffers.emplace_back();
+  } else {
+    std::shared_ptr<Buffer> buffer;
+    RETURN_NOT_OK(AllocateBitmap(pool, in_data.length, &buffer));
+    // Per spec, all trailing bits should indicate nullness; since the last
+    // byte might only be partially set, we proactively zero it.
+    ZeroLastByte(buffer.get());
+    buffer->ZeroPadding();
+    data_buffers.push_back(buffer);
+  }
+  // Allocate the boolean value buffer.
+  std::shared_ptr<Buffer> buffer;
+  RETURN_NOT_OK(AllocateBitmap(pool, in_data.length, &buffer));
+  // Some utility methods access the last byte before it might be
+  // initialized; this makes valgrind/asan unhappy, so we proactively
+  // zero it.
+  ZeroLastByte(buffer.get());
+  data_buffers.push_back(buffer);
+  out->value = ArrayData::Make(null(), in_data.length, data_buffers);
+
+  return delegate_->Call(ctx, input, out);
+}
+
 }  // namespace detail
 }  // namespace compute
 }  // namespace arrow
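One subtlety in the preallocation path above is worth spelling out: a validity bitmap for `length` values occupies `(length + 7) / 8` bytes, so whenever `length` is not a multiple of 8 the final byte contains padding bits that no writer ever touches. A small illustration of the arithmetic; the helper names here are illustrative (mirroring `arrow::BitUtil::BytesForBits`), not additions to the patch:

    #include <cstdint>

    // Bytes required to hold a bitmap of `length` bits.
    inline int64_t BytesForBits(int64_t length) { return (length + 7) / 8; }

    // For length == 13, two bytes are allocated and bits 13..15 are padding.
    // Freshly allocated memory leaves them undefined, which is why Call()
    // zeroes the last byte up front: bitwise comparisons and valgrind/asan
    // would otherwise see uninitialized padding.
    void PrepareBitmap(uint8_t* bitmap, int64_t length) {
      bitmap[BytesForBits(length) - 1] = 0;  // same effect as ZeroLastByte()
    }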
diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h
index 23ed4fd7ee7d7..2dd8c0288a7e2 100644
--- a/cpp/src/arrow/compute/kernels/util-internal.h
+++ b/cpp/src/arrow/compute/kernels/util-internal.h
@@ -32,7 +32,9 @@ namespace compute {
 
 class FunctionContext;
 
-static inline void CopyData(const ArrayData& input, ArrayData* output) {
+// \brief Zero-copy the buffers of a source array into a destination array,
+// without carrying over the type.
+static inline void ZeroCopyData(const ArrayData& input, ArrayData* output) {
   output->length = input.length;
   output->null_count = input.null_count;
   output->buffers = input.buffers;
@@ -42,6 +44,9 @@ static inline void CopyData(const ArrayData& input, ArrayData* output) {
 
 namespace detail {
 
+/// \brief Invoke the kernel on value using the ctx and store results in outputs.
+///
+/// \param[out] outputs One ArrayData datum for each ArrayData available in value.
 ARROW_EXPORT
 Status InvokeUnaryArrayKernel(FunctionContext* ctx, UnaryKernel* kernel,
                               const Datum& value, std::vector<Datum>* outputs);
@@ -61,6 +66,26 @@ Datum WrapArraysLike(const Datum& value,
 ARROW_EXPORT
 Datum WrapDatumsLike(const Datum& value, const std::vector<Datum>& datums);
 
+/// \brief Kernel used to preallocate outputs for primitive types.
+class PrimitiveAllocatingUnaryKernel : public UnaryKernel {
+ public:
+  explicit PrimitiveAllocatingUnaryKernel(std::unique_ptr<UnaryKernel> delegate);
+  /// \brief Sets out to be of type ArrayData with the necessary
+  /// data buffers prepopulated.
+  ///
+  /// This method does not populate types on arrays and sets type to null.
+  ///
+  /// The current implementation only supports primitive boolean outputs and
+  /// assumes validity bitmaps that are not sliced will be zero copied (i.e.
+  /// no allocation happens for them).
+  ///
+  /// TODO(ARROW-1896): Make this generic enough to support casts.
+  Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override;
+
+ private:
+  std::unique_ptr<UnaryKernel> delegate_;
+};
+
 }  // namespace detail
 }  // namespace compute
diff --git a/cpp/src/arrow/compute/test-util.h b/cpp/src/arrow/compute/test-util.h
new file mode 100644
index 0000000000000..e2bda698a9bff
--- /dev/null
+++ b/cpp/src/arrow/compute/test-util.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_COMPUTE_TEST_UTIL_H
+#define ARROW_COMPUTE_TEST_UTIL_H
+
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/type.h"
+
+#include "arrow/compute/context.h"
+
+namespace arrow {
+namespace compute {
+
+class ComputeFixture {
+ public:
+  ComputeFixture() : ctx_(default_memory_pool()) {}
+
+ protected:
+  FunctionContext ctx_;
+};
+
+template <typename Type, typename T>
+std::shared_ptr<Array> _MakeArray(const std::shared_ptr<DataType>& type,
+                                  const std::vector<T>& values,
+                                  const std::vector<bool>& is_valid) {
+  std::shared_ptr<Array> result;
+  if (is_valid.size() > 0) {
+    ArrayFromVector<Type, T>(type, is_valid, values, &result);
+  } else {
+    ArrayFromVector<Type, T>(type, values, &result);
+  }
+  return result;
+}
+
+}  // namespace compute
+}  // namespace arrow
+
+#endif
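The new test-util.h header centralizes the scaffolding the hash kernel tests above rely on. For orientation, a hypothetical test built on it would look like the following (the test class and kernel under test are stand-ins, not code from this patch):

    // Illustrative only: combining ComputeFixture and _MakeArray the way
    // hash-test.cc does.
    class TestExample : public ComputeFixture, public ::testing::Test {};

    TEST_F(TestExample, UniqueStrings) {
      auto arr = _MakeArray<StringType, std::string>(utf8(), {"a", "b", "a"}, {});
      std::shared_ptr<Array> result;
      ASSERT_OK(Unique(&this->ctx_, arr, &result));  // ctx_ comes from ComputeFixture
    }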
diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt
index 84b080b1eef09..2a72dceadad16 100644
--- a/cpp/src/arrow/csv/CMakeLists.txt
+++ b/cpp/src/arrow/csv/CMakeLists.txt
@@ -15,17 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.
 
-ADD_ARROW_TEST(csv-chunker-test)
-ADD_ARROW_TEST(csv-column-builder-test)
-ADD_ARROW_TEST(csv-converter-test)
-ADD_ARROW_TEST(csv-parser-test)
+ADD_ARROW_TEST(chunker-test
+    PREFIX "arrow-csv")
+ADD_ARROW_TEST(column-builder-test
+    PREFIX "arrow-csv")
+ADD_ARROW_TEST(converter-test
+    PREFIX "arrow-csv")
+ADD_ARROW_TEST(parser-test
+    PREFIX "arrow-csv")
 
-ADD_ARROW_BENCHMARK(csv-converter-benchmark)
-ADD_ARROW_BENCHMARK(csv-parser-benchmark)
+ADD_ARROW_BENCHMARK(converter-benchmark
+    PREFIX "arrow-csv")
+ADD_ARROW_BENCHMARK(parser-benchmark
+    PREFIX "arrow-csv")
 
-# Headers: top level
-file(GLOB_RECURSE ARROW_CSV_HEADERS "*.h")
-
-install(FILES
-  ${ARROW_CSV_HEADERS}
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/csv")
+ARROW_INSTALL_ALL_HEADERS("arrow/csv")
diff --git a/cpp/src/arrow/csv/csv-chunker-test.cc b/cpp/src/arrow/csv/chunker-test.cc
similarity index 100%
rename from cpp/src/arrow/csv/csv-chunker-test.cc
rename to cpp/src/arrow/csv/chunker-test.cc
diff --git a/cpp/src/arrow/csv/csv-column-builder-test.cc b/cpp/src/arrow/csv/column-builder-test.cc
similarity index 100%
rename from cpp/src/arrow/csv/csv-column-builder-test.cc
rename to cpp/src/arrow/csv/column-builder-test.cc
diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc
index 28cbad47580e8..1f37046798fd7 100644
--- a/cpp/src/arrow/csv/column-builder.cc
+++ b/cpp/src/arrow/csv/column-builder.cc
@@ -305,12 +305,12 @@ Status InferringColumnBuilder::TryConvertChunk(size_t chunk_index) {
 
 void InferringColumnBuilder::Insert(int64_t block_index,
                                     const std::shared_ptr<BlockParser>& parser) {
-  DCHECK_NE(converter_, nullptr);
-
   // Create a slot for the new chunk and spawn a task to convert it
   size_t chunk_index = static_cast<size_t>(block_index);
   {
     std::lock_guard<std::mutex> lock(mutex_);
+
+    DCHECK_NE(converter_, nullptr);
     if (chunks_.size() <= chunk_index) {
       chunks_.resize(chunk_index + 1);
     }
diff --git a/cpp/src/arrow/csv/column-builder.h b/cpp/src/arrow/csv/column-builder.h
index b21cff76be5c6..054a642295cb5 100644
--- a/cpp/src/arrow/csv/column-builder.h
+++ b/cpp/src/arrow/csv/column-builder.h
@@ -18,22 +18,29 @@
 #ifndef ARROW_CSV_COLUMN_BUILDER_H
 #define ARROW_CSV_COLUMN_BUILDER_H
 
+#include
 #include
-#include
 
 #include "arrow/array.h"
-#include "arrow/csv/converter.h"
-#include "arrow/csv/options.h"
-#include "arrow/memory_pool.h"
 #include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/util/task-group.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+
+class ChunkedArray;
+class DataType;
+
+namespace internal {
+
+class TaskGroup;
+
+}  // namespace internal
+
 namespace csv {
 
+class BlockParser;
+struct ConvertOptions;
+
 class ARROW_EXPORT ColumnBuilder {
  public:
   virtual ~ColumnBuilder() = default;
diff --git a/cpp/src/arrow/csv/csv-converter-benchmark.cc b/cpp/src/arrow/csv/converter-benchmark.cc
similarity index 100%
rename from cpp/src/arrow/csv/csv-converter-benchmark.cc
rename to cpp/src/arrow/csv/converter-benchmark.cc
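The converter-test changes that follow exercise a new null_values knob on ConvertOptions (declared later in this patch in options.h, with its defaults filled in options.cc). In user code, the same configuration reads as below; only the option values are invented for the example:

    // Assumes the ConvertOptions API added by this patch.
    auto options = arrow::csv::ConvertOptions::Defaults();
    // Replace the default Pandas-style spellings ("", "N/A", "NaN", ...) with
    // a custom list; duplicates are tolerated by the trie builder.
    options.null_values = {"xxx", "zzz"};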
AssertConversion(int8(), {"12,N/A\n", ",-128\n"}, + {{12, 0}, {0, -128}}, + {{true, false}, {false, true}}); AssertConversionAllNulls(int8()); } +TEST(IntegerConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(int8(), {"12,xxx\n", "zzz,-128\n"}, + {{12, 0}, {0, -128}}, {{true, false}, {false, true}}, + options); + + AssertConversionError(int8(), {",xxx,N/A\n"}, {0, 2}, options); + + // Duplicate nulls allowed + options.null_values = {"xxx", "zzz", "xxx"}; + AssertConversion(int8(), {"12,xxx\n", "zzz,-128\n"}, + {{12, 0}, {0, -128}}, {{true, false}, {false, true}}, + options); +} + TEST(IntegerConversion, Whitespace) { AssertConversion(int32(), {" 12,34 \n", " 56 ,78\n"}, {{12, 56}, {34, 78}}); @@ -203,6 +220,15 @@ TEST(FloatingPointConversion, Nulls) { AssertConversionAllNulls(float64()); } +TEST(FloatingPointConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(float32(), {"1.5,xxx\n", "zzz,-1e10\n"}, + {{1.5, 0.}, {0., -1e10f}}, + {{true, false}, {false, true}}, options); +} + TEST(FloatingPointConversion, Whitespace) { AssertConversion(float64(), {" 12,34.5\n", " 0 ,-1e100 \n"}, {{12., 0.}, {34.5, -1e100}}); @@ -220,6 +246,15 @@ TEST(BooleanConversion, Nulls) { {{true, true}, {false, true}}); } +TEST(BooleanConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(boolean(), {"true,xxx\n", "zzz,0\n"}, + {{true, false}, {false, false}}, + {{true, false}, {false, true}}, options); +} + TEST(TimestampConversion, Basics) { auto type = timestamp(TimeUnit::SECOND); @@ -243,6 +278,16 @@ TEST(TimestampConversion, Nulls) { {{true}, {false}, {false}}); } +TEST(TimestampConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + auto type = timestamp(TimeUnit::MILLI); + AssertConversion(type, {"1970-01-01 00:01:00,xxx,zzz\n"}, + {{60000}, {0}, {0}}, + {{true}, {false}, {false}}, options); +} + TEST(DecimalConversion, NotImplemented) { std::shared_ptr converter; ASSERT_RAISES(NotImplemented, diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 7d8bff870ba84..22be7d6e58f3b 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "arrow/builder.h" #include "arrow/csv/parser.h" @@ -28,21 +30,23 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/parsing.h" // IWYU pragma: keep +#include "arrow/util/trie.h" #include "arrow/util/utf8.h" namespace arrow { namespace csv { using internal::StringConverter; +using internal::Trie; +using internal::TrieBuilder; namespace { Status GenericConversionError(const std::shared_ptr& type, const uint8_t* data, uint32_t size) { - std::stringstream ss; - ss << "CSV conversion error to " << type->ToString() << ": invalid value '" - << std::string(reinterpret_cast(data), size) << "'"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type->ToString(), + ": invalid value '", + std::string(reinterpret_cast(data), size), "'"); } inline bool IsWhitespace(uint8_t c) { @@ -57,115 +61,28 @@ class ConcreteConverter : public Converter { using Converter::Converter; protected: - Status Initialize() override { return Status::OK(); } + Status Initialize() override; inline bool IsNull(const uint8_t* data, 
uint32_t size, bool quoted); + + Trie null_trie_; }; -// Recognize various spellings of null values. The list of possible spellings -// is taken from Pandas read_csv() documentation. +Status ConcreteConverter::Initialize() { + // TODO no need to build a separate Trie for each Converter instance + TrieBuilder builder; + for (const auto& s : options_.null_values) { + RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */)); + } + null_trie_ = builder.Finish(); + return Status::OK(); +} + bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) { if (quoted) { return false; } - if (size == 0) { - return true; - } - // No 1-character null value exists - if (size == 1) { - return false; - } - - // XXX if the CSV parser guaranteed enough excess bytes at the end of the - // parsed area, we wouldn't need to always check size before comparing characters. - - auto chars = reinterpret_cast(data); - auto first = chars[0]; - auto second = chars[1]; - switch (first) { - case 'N': { - // "NA", "N/A", "NaN", "NULL" - if (size == 2) { - return second == 'A'; - } - auto third = chars[2]; - if (size == 3) { - return (second == '/' && third == 'A') || (second == 'a' && third == 'N'); - } - if (size == 4) { - return (second == 'U' && third == 'L' && chars[3] == 'L'); - } - return false; - } - case 'n': { - // "n/a", "nan", "null" - if (size == 2) { - return false; - } - auto third = chars[2]; - if (size == 3) { - return (second == '/' && third == 'a') || (second == 'a' && third == 'n'); - } - if (size == 4) { - return (second == 'u' && third == 'l' && chars[3] == 'l'); - } - return false; - } - case '1': { - // '1.#IND', '1.#QNAN' - if (size == 6) { - // '#' is the most unlikely char here, check it first - return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'I' && - chars[4] == 'N' && chars[5] == 'D'); - } - if (size == 7) { - return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'Q' && - chars[4] == 'N' && chars[5] == 'A' && chars[6] == 'N'); - } - return false; - } - case '-': { - switch (second) { - case 'N': - // "-NaN" - return (size == 4 && chars[2] == 'a' && chars[3] == 'N'); - case 'n': - // "-nan" - return (size == 4 && chars[2] == 'a' && chars[3] == 'n'); - case '1': - // "-1.#IND", "-1.#QNAN" - if (size == 7) { - return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'I' && - chars[5] == 'N' && chars[6] == 'D'); - } - if (size == 8) { - return (chars[3] == '#' && chars[2] == '.' 
&& chars[4] == 'Q' && - chars[5] == 'N' && chars[6] == 'A' && chars[7] == 'N'); - } - return false; - default: - return false; - } - } - case '#': { - // "#N/A", "#N/A N/A", "#NA" - if (size < 3 || chars[1] != 'N') { - return false; - } - auto third = chars[2]; - if (size == 3) { - return third == 'A'; - } - if (size == 4) { - return third == '/' && chars[3] == 'A'; - } - if (size == 8) { - return std::memcmp(data + 2, "/A N/A", 5) == 0; - } - return false; - } - default: - return false; - } + return null_trie_.Find(util::string_view(reinterpret_cast(data), size)) >= + 0; } ///////////////////////////////////////////////////////////////////////// @@ -213,9 +130,8 @@ class VarSizeBinaryConverter : public ConcreteConverter { auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) { - std::stringstream ss; - ss << "CSV conversion error to " << type_->ToString() << ": invalid UTF8 data"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": invalid UTF8 data"); } builder.UnsafeAppend(data, size); return Status::OK(); @@ -255,10 +171,8 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { if (ARROW_PREDICT_FALSE(size != byte_width)) { - std::stringstream ss; - ss << "CSV conversion error to " << type_->ToString() << ": got a " << size - << "-byte long string"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type_->ToString(), ": got a ", + size, "-byte long string"); } return builder.Append(data); }; @@ -409,9 +323,8 @@ Status Converter::Make(const std::shared_ptr& type, break; default: { - std::stringstream ss; - ss << "CSV conversion to " << type->ToString() << " is not supported"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("CSV conversion to ", type->ToString(), + " is not supported"); } #undef CONVERTER_CASE diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h index 38ade1d21a846..d64fe695d0a26 100644 --- a/cpp/src/arrow/csv/converter.h +++ b/cpp/src/arrow/csv/converter.h @@ -57,7 +57,7 @@ class ARROW_EXPORT Converter { virtual Status Initialize() = 0; - ConvertOptions options_; + const ConvertOptions options_; MemoryPool* pool_; std::shared_ptr type_; }; diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc index fccf0b67db98c..01e687b8342a3 100644 --- a/cpp/src/arrow/csv/options.cc +++ b/cpp/src/arrow/csv/options.cc @@ -22,7 +22,14 @@ namespace csv { ParseOptions ParseOptions::Defaults() { return ParseOptions(); } -ConvertOptions ConvertOptions::Defaults() { return ConvertOptions(); } +ConvertOptions ConvertOptions::Defaults() { + auto options = ConvertOptions(); + // The default list of possible null spellings is taken from Pandas' read_csv(). 
+ options.null_values = {"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", + "-NaN", "-nan", "1.#IND", "1.#QNAN", "N/A", "NA", + "NULL", "NaN", "n/a", "nan", "null"}; + return options; +} ReadOptions ReadOptions::Defaults() { return ReadOptions(); } diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 10232d45e8df4..2b4653ccdce81 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "arrow/util/visibility.h" @@ -66,6 +67,8 @@ struct ARROW_EXPORT ConvertOptions { bool check_utf8 = true; // Optional per-column types (disabling type inference on those columns) std::unordered_map> column_types; + // Recognized spellings for null values + std::vector null_values; static ConvertOptions Defaults(); }; diff --git a/cpp/src/arrow/csv/csv-parser-benchmark.cc b/cpp/src/arrow/csv/parser-benchmark.cc similarity index 100% rename from cpp/src/arrow/csv/csv-parser-benchmark.cc rename to cpp/src/arrow/csv/parser-benchmark.cc diff --git a/cpp/src/arrow/csv/csv-parser-test.cc b/cpp/src/arrow/csv/parser-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-parser-test.cc rename to cpp/src/arrow/csv/parser-test.cc diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index fe7f841f58328..b1d175adfb582 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -30,9 +30,7 @@ namespace arrow { namespace csv { static Status ParseError(const char* message) { - std::stringstream ss; - ss << "CSV parse error: " << message; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV parse error: ", message); } static Status MismatchingColumns(int32_t expected, int32_t actual) { diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h index 8a515744ee2d9..fdddc37a2c0fb 100644 --- a/cpp/src/arrow/csv/parser.h +++ b/cpp/src/arrow/csv/parser.h @@ -18,6 +18,7 @@ #ifndef ARROW_CSV_PARSER_H #define ARROW_CSV_PARSER_H +#include #include #include #include diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 8cf74d6b99901..efd61167b71a5 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include "arrow/buffer.h" @@ -353,10 +355,8 @@ class ThreadedTableReader : public BaseTableReader { chunk_size, &parsed_size)); if (parsed_size != chunk_size) { DCHECK_EQ(parsed_size, chunk_size); - std::stringstream ss; - ss << "Chunker and parser disagree on block size: " << chunk_size << " vs " - << parsed_size; - return Status::Invalid(ss.str()); + return Status::Invalid("Chunker and parser disagree on block size: ", + chunk_size, " vs ", parsed_size); } RETURN_NOT_OK(ProcessData(parser, chunk_index)); // Keep chunk buffer alive within closure and release it at the end diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index 3a16a7834c3c1..d2640a66b2f8f 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -16,17 +16,10 @@ # under the License. 
add_custom_target(arrow_hiveserver2) +add_custom_target(arrow_hiveserver2-tests) # Headers: top level -install(FILES - api.h - columnar-row-set.h - operation.h - service.h - session.h - types.h - util.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/dbi/hiveserver2") +ARROW_INSTALL_ALL_HEADERS("arrow/dbi/hiveserver2") set(ARROW_HIVESERVER2_SRCS columnar-row-set.cc @@ -111,11 +104,13 @@ set(ARROW_HIVESERVER2_TEST_LINK_LIBS thriftstatic) if (ARROW_BUILD_TESTS) - ADD_ARROW_TEST(hiveserver2-test + ADD_TEST_CASE(hiveserver2-test STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" - LABELS "arrow_hiveserver2" + LABELS "arrow_hiveserver2-tests" ) - set_property(TARGET hiveserver2-test - APPEND_STRING PROPERTY COMPILE_FLAGS - " -Wno-shadow-field") + if (TARGET arrow-hiveserver2-test) + set_property(TARGET arrow-hiveserver2-test + APPEND_STRING PROPERTY COMPILE_FLAGS + " -Wno-shadow-field") + endif() endif(ARROW_BUILD_TESTS) diff --git a/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc b/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc index 7022ff017f48e..a7749161c4676 100644 --- a/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc +++ b/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc @@ -97,10 +97,8 @@ Status Wait(const std::unique_ptr& op, if (op_state == state) { return Status::OK(); } else { - std::stringstream ss; - ss << "Failed to reach state '" << OperationStateToString(state) << "' after " - << retries << " retries."; - return Status::IOError(ss.str()); + return Status::IOError("Failed to reach state '", OperationStateToString(state), + "' after ", retries, " retries"); } } diff --git a/cpp/src/arrow/dbi/hiveserver2/service.cc b/cpp/src/arrow/dbi/hiveserver2/service.cc index e2d3f2a21bf37..502a8a284b86f 100644 --- a/cpp/src/arrow/dbi/hiveserver2/service.cc +++ b/cpp/src/arrow/dbi/hiveserver2/service.cc @@ -92,9 +92,7 @@ Service::Service(const string& host, int port, int conn_timeout, Status Service::Open() { if (impl_->protocol_version < hs2::TProtocolVersion::HIVE_CLI_SERVICE_PROTOCOL_V6) { - std::stringstream ss; - ss << "Unsupported protocol: " << impl_->protocol_version; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported protocol: ", impl_->protocol_version); } impl_->socket.reset(new TSocket(host_, port_)); diff --git a/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc b/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc index d154e143ba290..171eae36816e0 100644 --- a/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc +++ b/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc @@ -204,11 +204,7 @@ Status TStatusToStatus(const hs2::TStatus& tstatus) { return Status::IOError(tstatus.errorMessage); case hs2::TStatusCode::INVALID_HANDLE_STATUS: return Status::Invalid("Invalid handle"); - default: { - std::stringstream ss; - ss << "Unknown TStatusCode " << tstatus.statusCode; - return Status::UnknownError(ss.str()); - } + default: { return Status::UnknownError("Unknown TStatusCode ", tstatus.statusCode); } } } diff --git a/cpp/src/arrow/dbi/hiveserver2/thrift/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/thrift/CMakeLists.txt index be689f935c95c..ed90fe8f9e0d6 100644 --- a/cpp/src/arrow/dbi/hiveserver2/thrift/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/thrift/CMakeLists.txt @@ -78,7 +78,7 @@ endfunction(HS2_THRIFT_GEN) message("Using Thrift compiler: ${THRIFT_COMPILER}") -set(OUTPUT_DIR ${CMAKE_BINARY_DIR}/src) +set(OUTPUT_DIR ${ARROW_BINARY_DIR}/src) file(MAKE_DIRECTORY ${OUTPUT_DIR}) add_custom_command(OUTPUT 
${CMAKE_CURRENT_BINARY_DIR}/ErrorCodes.thrift diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index bc22d60b7131a..f59ea3c5e6757 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -18,24 +18,26 @@ add_custom_target(arrow_flight) # Header files -install(FILES - api.h - client.h - server.h - types.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/flight") +ARROW_INSTALL_ALL_HEADERS("arrow/flight") SET(ARROW_FLIGHT_STATIC_LINK_LIBS - grpc_grpcpp - grpc_grpc - grpc_gpr - grpc_address_sorting - cares) + grpc_grpcpp_static + grpc_grpc_static + grpc_gpr_static + grpc_address_sorting_static + cares_static) + +SET(ARROW_FLIGHT_TEST_STATIC_LINK_LIBS + arrow_static + arrow_flight_static + arrow_testing_static + ${ARROW_FLIGHT_STATIC_LINK_LIBS} + ${PROTOBUF_LIBRARY}) # TODO(wesm): Protobuf shared vs static linking -set(FLIGHT_PROTO_PATH "${CMAKE_SOURCE_DIR}/../format") -set(FLIGHT_PROTO ${CMAKE_SOURCE_DIR}/../format/Flight.proto) +set(FLIGHT_PROTO_PATH "${ARROW_SOURCE_DIR}/../format") +set(FLIGHT_PROTO ${ARROW_SOURCE_DIR}/../format/Flight.proto) set(FLIGHT_GENERATED_PROTO_FILES "${CMAKE_CURRENT_BINARY_DIR}/Flight.pb.cc" @@ -43,14 +45,7 @@ set(FLIGHT_GENERATED_PROTO_FILES "${CMAKE_CURRENT_BINARY_DIR}/Flight.grpc.pb.cc" "${CMAKE_CURRENT_BINARY_DIR}/Flight.grpc.pb.h") -if(PROTOBUF_VENDORED) - set(PROTO_DEPENDS ${FLIGHT_PROTO} protobuf) -else() - set(PROTO_DEPENDS ${FLIGHT_PROTO}) -endif() - -# Get location of grpc_cpp_plugin so we can pass it to protoc -get_property(GRPC_CPP_PLUGIN TARGET gRPC::grpc_cpp_plugin PROPERTY LOCATION) +set(PROTO_DEPENDS ${FLIGHT_PROTO} ${PROTOBUF_LIBRARY}) add_custom_command( OUTPUT ${FLIGHT_GENERATED_PROTO_FILES} @@ -80,26 +75,36 @@ set(ARROW_FLIGHT_SRCS ADD_ARROW_LIB(arrow_flight SOURCES ${ARROW_FLIGHT_SRCS} - DEPENDENCIES arrow_dependencies SHARED_LINK_LIBS arrow_shared ${ARROW_FLIGHT_STATIC_LINK_LIBS} STATIC_LINK_LIBS arrow_static ${ARROW_FLIGHT_STATIC_LINK_LIBS}) ADD_ARROW_TEST(flight-test - EXTRA_LINK_LIBS arrow_flight_static ${ARROW_FLIGHT_STATIC_LINK_LIBS} + EXTRA_LINK_LIBS ${ARROW_FLIGHT_TEST_STATIC_LINK_LIBS} LABELS "arrow_flight") # Build test server for unit tests or benchmarks if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) add_executable(flight-test-server test-server.cc) target_link_libraries(flight-test-server - arrow_flight_static - ${ARROW_FLIGHT_STATIC_LINK_LIBS} + ${ARROW_FLIGHT_TEST_STATIC_LINK_LIBS} + gflags_static + ${GTEST_LIBRARY}) + + add_executable(flight-test-integration-server test-integration-server.cc) + target_link_libraries(flight-test-integration-server + ${ARROW_FLIGHT_TEST_STATIC_LINK_LIBS} + gflags_static + gtest_static) + + add_executable(flight-test-integration-client test-integration-client.cc) + target_link_libraries(flight-test-integration-client + ${ARROW_FLIGHT_TEST_STATIC_LINK_LIBS} gflags_static gtest_static) # This is needed for the unit tests if (ARROW_BUILD_TESTS) - add_dependencies(flight-test flight-test-server) + add_dependencies(arrow-flight-test flight-test-server) endif() endif() @@ -124,7 +129,7 @@ if (ARROW_BUILD_BENCHMARKS) arrow_flight_static ${ARROW_FLIGHT_STATIC_LINK_LIBS} gflags_static - gtest_static) + ${GTEST_LIBRARY}) add_executable(flight-benchmark flight-benchmark.cc @@ -133,7 +138,7 @@ if (ARROW_BUILD_BENCHMARKS) arrow_flight_static ${ARROW_FLIGHT_STATIC_LINK_LIBS} gflags_static - gtest_static) + ${GTEST_LIBRARY}) add_dependencies(flight-benchmark flight-perf-server) endif(ARROW_BUILD_BENCHMARKS) diff --git 
a/cpp/src/arrow/flight/client.cc b/cpp/src/arrow/flight/client.cc index 94c4928d0220d..e25c1875d669f 100644 --- a/cpp/src/arrow/flight/client.cc +++ b/cpp/src/arrow/flight/client.cc @@ -232,7 +232,16 @@ class FlightStreamReader : public RecordBatchReader { // Validate IPC message RETURN_NOT_OK(ipc::Message::Open(data.metadata, data.body, &message)); - return ipc::ReadRecordBatch(*message, schema_, out); + // The first message is a schema; read it and then try to read a + // record batch. + if (message->type() == ipc::Message::Type::SCHEMA) { + RETURN_NOT_OK(ipc::ReadSchema(*message, &schema_)); + return ReadNext(out); + } else if (message->type() == ipc::Message::Type::RECORD_BATCH) { + return ipc::ReadRecordBatch(*message, schema_, out); + } else { + return Status(StatusCode::Invalid, "Unrecognized message in Flight stream"); + } } else { // Stream is completed stream_finished_ = true; diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index be3d86a0dde77..53bb1755b2995 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -87,7 +87,7 @@ class ARROW_EXPORT FlightClient { /// \brief Given a flight ticket and schema, request to be sent the /// stream. Returns record batch stream reader /// \param[in] ticket - /// \param[in] schema the arrow::Schema for the stream as computed by + /// \param[in] schema the schema of the stream data as computed by /// GetFlightInfo /// \param[out] stream the returned RecordBatchReader /// \return Status @@ -96,6 +96,7 @@ class ARROW_EXPORT FlightClient { /// \brief Initiate DoPut RPC, returns FlightPutWriter interface to /// write. Not yet implemented + /// \param[in] schema the schema of the stream data /// \param[out] stream the created stream to write record batches to /// \return Status Status DoPut(const Schema& schema, std::unique_ptr* stream); diff --git a/cpp/src/arrow/flight/flight-test.cc b/cpp/src/arrow/flight/flight-test.cc index 2d1b2f8477d9a..0389c76adb811 100644 --- a/cpp/src/arrow/flight/flight-test.cc +++ b/cpp/src/arrow/flight/flight-test.cc @@ -53,11 +53,11 @@ namespace arrow { namespace flight { TEST(TestFlight, StartStopTestServer) { - TestServer server("flight-test-server", 92385); + TestServer server("flight-test-server", 30000); server.Start(); ASSERT_TRUE(server.IsRunning()); - sleep_for(0.2); + std::this_thread::sleep_for(std::chrono::duration(0.2)); ASSERT_TRUE(server.IsRunning()); int exit_code = server.Stop(); @@ -79,7 +79,7 @@ class TestFlightClient : public ::testing::Test { // void TearDown() {} void SetUp() { - port_ = 92358; + port_ = 30000; server_.reset(new TestServer("flight-test-server", port_)); server_->Start(); ASSERT_OK(ConnectClient()); diff --git a/cpp/src/arrow/flight/internal.cc b/cpp/src/arrow/flight/internal.cc index 796e6095cdb7f..b4c6b2addcc11 100644 --- a/cpp/src/arrow/flight/internal.cc +++ b/cpp/src/arrow/flight/internal.cc @@ -37,16 +37,13 @@ Status FromGrpcStatus(const grpc::Status& grpc_status) { if (grpc_status.ok()) { return Status::OK(); } - std::stringstream ss; if (grpc_status.error_code() == grpc::StatusCode::UNIMPLEMENTED) { - ss << "gRPC returned unimplemented error, with message: " - << grpc_status.error_message(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("gRPC returned unimplemented error, with message: ", + grpc_status.error_message()); } else { - ss << "gRPC failed with error code " << grpc_status.error_code() - << " and message: " << grpc_status.error_message(); - return Status::IOError(ss.str()); + return 
Status::IOError("gRPC failed with error code ", grpc_status.error_code(), + " and message: ", grpc_status.error_message()); } } diff --git a/cpp/src/arrow/flight/server.cc b/cpp/src/arrow/flight/server.cc index 46815b5476c67..018c079501f2f 100644 --- a/cpp/src/arrow/flight/server.cc +++ b/cpp/src/arrow/flight/server.cc @@ -102,6 +102,10 @@ class SerializationTraits { int64_t body_size = 0; for (const auto& buffer : msg.body_buffers) { + // Buffer may be null when the row length is zero, or when all + // entries are invalid. + if (!buffer) continue; + body_size += buffer->size(); const int64_t remainder = buffer->size() % 8; @@ -111,7 +115,11 @@ class SerializationTraits { } // 2 bytes for body tag - total_size += 2 + WireFormatLite::LengthDelimitedSize(static_cast(body_size)); + // Only written when there are body buffers + if (msg.body_length > 0) { + total_size += + 2 + WireFormatLite::LengthDelimitedSize(static_cast(body_size)); + } // TODO(wesm): messages over 2GB unlikely to be yet supported if (total_size > kInt32Max) { @@ -135,20 +143,27 @@ class SerializationTraits { pb_stream.WriteRawMaybeAliased(msg.metadata->data(), static_cast(msg.metadata->size())); - // Write body - WireFormatLite::WriteTag(pb::FlightData::kDataBodyFieldNumber, - WireFormatLite::WIRETYPE_LENGTH_DELIMITED, &pb_stream); - pb_stream.WriteVarint32(static_cast(body_size)); + // Don't write tag if there are no body buffers + if (msg.body_length > 0) { + // Write body + WireFormatLite::WriteTag(pb::FlightData::kDataBodyFieldNumber, + WireFormatLite::WIRETYPE_LENGTH_DELIMITED, &pb_stream); + pb_stream.WriteVarint32(static_cast(body_size)); - constexpr uint8_t kPaddingBytes[8] = {0}; + constexpr uint8_t kPaddingBytes[8] = {0}; - for (const auto& buffer : msg.body_buffers) { - pb_stream.WriteRawMaybeAliased(buffer->data(), static_cast(buffer->size())); + for (const auto& buffer : msg.body_buffers) { + // Buffer may be null when the row length is zero, or when all + // entries are invalid. 
+ if (!buffer) continue; - // Write padding if not multiple of 8 - const int remainder = static_cast(buffer->size() % 8); - if (remainder) { - pb_stream.WriteRawMaybeAliased(kPaddingBytes, 8 - remainder); + pb_stream.WriteRawMaybeAliased(buffer->data(), static_cast(buffer->size())); + + // Write padding if not multiple of 8 + const int remainder = static_cast(buffer->size() % 8); + if (remainder) { + pb_stream.WriteRawMaybeAliased(kPaddingBytes, 8 - remainder); + } } } @@ -255,6 +270,14 @@ class FlightServiceImpl : public FlightService::Service { // Requires ServerWriter customization in grpc_customizations.h auto custom_writer = reinterpret_cast*>(writer); + // Write the schema as the first message in the stream + IpcPayload schema_payload; + MemoryPool* pool = default_memory_pool(); + ipc::DictionaryMemo dictionary_memo; + GRPC_RETURN_NOT_OK(ipc::internal::GetSchemaPayload( + *data_stream->schema(), pool, &dictionary_memo, &schema_payload)); + custom_writer->Write(schema_payload, grpc::WriteOptions()); + while (true) { IpcPayload payload; GRPC_RETURN_NOT_OK(data_stream->Next(&payload)); @@ -368,6 +391,8 @@ Status FlightServerBase::ListActions(std::vector* actions) { RecordBatchStream::RecordBatchStream(const std::shared_ptr& reader) : pool_(default_memory_pool()), reader_(reader) {} +std::shared_ptr RecordBatchStream::schema() { return reader_->schema(); } + Status RecordBatchStream::Next(IpcPayload* payload) { std::shared_ptr batch; RETURN_NOT_OK(reader_->ReadNext(&batch)); diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index 89154ac8623e0..b3b8239132b7a 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -28,6 +28,7 @@ #include "arrow/util/visibility.h" #include "arrow/flight/types.h" +#include "arrow/ipc/dictionary.h" namespace arrow { @@ -57,6 +58,9 @@ class ARROW_EXPORT FlightDataStream { public: virtual ~FlightDataStream() = default; + // When the stream starts, send the schema. + virtual std::shared_ptr schema() = 0; + // When the stream is completed, the last payload written will have null // metadata virtual Status Next(ipc::internal::IpcPayload* payload) = 0; @@ -69,6 +73,7 @@ class ARROW_EXPORT RecordBatchStream : public FlightDataStream { public: explicit RecordBatchStream(const std::shared_ptr& reader); + std::shared_ptr schema() override; Status Next(ipc::internal::IpcPayload* payload) override; private: @@ -115,7 +120,7 @@ class ARROW_EXPORT FlightServerBase { std::unique_ptr* info); /// \brief Get a stream of IPC payloads to put on the wire - /// \param[in] ticket an opaque ticket + /// \param[in] request an opaque ticket /// \param[out] stream the returned stream provider /// \return Status virtual Status DoGet(const Ticket& request, std::unique_ptr* stream); diff --git a/cpp/src/arrow/flight/test-integration-client.cc b/cpp/src/arrow/flight/test-integration-client.cc new file mode 100644 index 0000000000000..267025a451cc7 --- /dev/null +++ b/cpp/src/arrow/flight/test-integration-client.cc @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Client implementation for Flight integration testing. Requests the given +// path from the Flight server, which reads that file and sends it as a stream +// to the client. The client writes the server stream to the IPC file format at +// the given output file path. The integration test script then uses the +// existing integration test tools to compare the output binary with the +// original JSON + +#include +#include +#include + +#include + +#include "arrow/io/test-common.h" +#include "arrow/ipc/json.h" +#include "arrow/record_batch.h" + +#include "arrow/flight/server.h" +#include "arrow/flight/test-util.h" + +DEFINE_string(host, "localhost", "Server port to connect to"); +DEFINE_int32(port, 31337, "Server port to connect to"); +DEFINE_string(path, "", "Resource path to request"); +DEFINE_string(output, "", "Where to write requested resource"); + +int main(int argc, char** argv) { + gflags::SetUsageMessage("Integration testing client for Flight."); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + std::unique_ptr client; + ABORT_NOT_OK(arrow::flight::FlightClient::Connect(FLAGS_host, FLAGS_port, &client)); + + arrow::flight::FlightDescriptor descr{ + arrow::flight::FlightDescriptor::PATH, "", {FLAGS_path}}; + std::unique_ptr info; + ABORT_NOT_OK(client->GetFlightInfo(descr, &info)); + + std::shared_ptr schema; + ABORT_NOT_OK(info->GetSchema(&schema)); + + if (info->endpoints().size() == 0) { + std::cerr << "No endpoints returned from Flight server." << std::endl; + return -1; + } + + arrow::flight::Ticket ticket = info->endpoints()[0].ticket; + std::unique_ptr stream; + ABORT_NOT_OK(client->DoGet(ticket, schema, &stream)); + + std::shared_ptr out_file; + ABORT_NOT_OK(arrow::io::FileOutputStream::Open(FLAGS_output, &out_file)); + std::shared_ptr writer; + ABORT_NOT_OK(arrow::ipc::RecordBatchFileWriter::Open(out_file.get(), schema, &writer)); + + std::shared_ptr chunk; + while (true) { + ABORT_NOT_OK(stream->ReadNext(&chunk)); + if (chunk == nullptr) break; + ABORT_NOT_OK(writer->WriteRecordBatch(*chunk)); + } + + ABORT_NOT_OK(writer->Close()); + + return 0; +} diff --git a/cpp/src/arrow/flight/test-integration-server.cc b/cpp/src/arrow/flight/test-integration-server.cc new file mode 100644 index 0000000000000..80813e7f19a4c --- /dev/null +++ b/cpp/src/arrow/flight/test-integration-server.cc @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// Example server implementation for integration testing purposes + +#include +#include +#include +#include + +#include + +#include "arrow/io/test-common.h" +#include "arrow/ipc/json.h" +#include "arrow/record_batch.h" + +#include "arrow/flight/server.h" +#include "arrow/flight/test-util.h" + +DEFINE_int32(port, 31337, "Server port to listen on"); + +namespace arrow { +namespace flight { + +class JsonReaderRecordBatchStream : public FlightDataStream { + public: + explicit JsonReaderRecordBatchStream( + std::unique_ptr&& reader) + : index_(0), pool_(default_memory_pool()), reader_(std::move(reader)) {} + + std::shared_ptr schema() override { return reader_->schema(); } + + Status Next(ipc::internal::IpcPayload* payload) override { + if (index_ >= reader_->num_record_batches()) { + // Signal that iteration is over + payload->metadata = nullptr; + return Status::OK(); + } + + std::shared_ptr batch; + RETURN_NOT_OK(reader_->ReadRecordBatch(index_, &batch)); + index_++; + + if (!batch) { + // Signal that iteration is over + payload->metadata = nullptr; + return Status::OK(); + } else { + return ipc::internal::GetRecordBatchPayload(*batch, pool_, payload); + } + } + + private: + int index_; + MemoryPool* pool_; + std::unique_ptr reader_; +}; + +class FlightIntegrationTestServer : public FlightServerBase { + Status ReadJson(const std::string& json_path, + std::unique_ptr* out) { + std::shared_ptr in_file; + std::cout << "Opening JSON file '" << json_path << "'" << std::endl; + RETURN_NOT_OK(io::ReadableFile::Open(json_path, &in_file)); + + int64_t file_size = 0; + RETURN_NOT_OK(in_file->GetSize(&file_size)); + + std::shared_ptr json_buffer; + RETURN_NOT_OK(in_file->Read(file_size, &json_buffer)); + + RETURN_NOT_OK(arrow::ipc::internal::json::JsonReader::Open(json_buffer, out)); + return Status::OK(); + } + + Status GetFlightInfo(const FlightDescriptor& request, + std::unique_ptr* info) override { + if (request.type == FlightDescriptor::PATH) { + if (request.path.size() == 0) { + return Status::Invalid("Invalid path"); + } + + std::unique_ptr reader; + RETURN_NOT_OK(ReadJson(request.path.back(), &reader)); + + FlightEndpoint endpoint1({{request.path.back()}, {}}); + + FlightInfo::Data flight_data; + RETURN_NOT_OK(internal::SchemaToString(*reader->schema(), &flight_data.schema)); + flight_data.descriptor = request; + flight_data.endpoints = {endpoint1}; + flight_data.total_records = reader->num_record_batches(); + flight_data.total_bytes = -1; + FlightInfo value(flight_data); + + *info = std::unique_ptr(new FlightInfo(value)); + return Status::OK(); + } else { + return Status::NotImplemented(request.type); + } + } + + Status DoGet(const Ticket& request, + std::unique_ptr* data_stream) override { + std::unique_ptr reader; + RETURN_NOT_OK(ReadJson(request.ticket, &reader)); + + *data_stream = std::unique_ptr( + new JsonReaderRecordBatchStream(std::move(reader))); + + return Status::OK(); + } +}; + +} // namespace flight +} // namespace arrow + +std::unique_ptr g_server; + +void Shutdown(int signal) { + if (g_server != nullptr) { + g_server->Shutdown(); + } +} + +int main(int argc, char** argv) { + gflags::SetUsageMessage("Integration testing server for Flight."); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // SIGTERM shuts down the server + signal(SIGTERM, Shutdown); + + g_server.reset(new arrow::flight::FlightIntegrationTestServer); + g_server->Run(FLAGS_port); + return 0; +} 
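Between server.cc, client.cc, and the integration server above, the DoGet wire contract is: the first FlightData message carries the schema, each following message carries one record batch, and a payload whose metadata is null tells the server write loop to stop. Every FlightDataStream implementation therefore ends the same way; the following is a condensed sketch of that convention, mirroring RecordBatchStream and JsonReaderRecordBatchStream from this patch rather than adding new API:

    // End-of-stream convention for FlightDataStream::Next (sketch).
    Status Next(ipc::internal::IpcPayload* payload) override {
      std::shared_ptr<RecordBatch> batch;
      RETURN_NOT_OK(reader_->ReadNext(&batch));
      if (batch == nullptr) {
        payload->metadata = nullptr;  // null metadata == stream complete
        return Status::OK();
      }
      return ipc::internal::GetRecordBatchPayload(*batch, pool_, payload);
    }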
diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 60407acb0a1ec..204cb5e313f3f 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -19,6 +19,12 @@ # arrow_cuda ####################################### +add_custom_target(arrow_cuda-all) +add_custom_target(arrow_cuda) +add_custom_target(arrow_cuda-benchmarks) +add_custom_target(arrow_cuda-tests) +add_dependencies(arrow_cuda-all arrow_cuda arrow_cuda-tests arrow_cuda-benchmarks) + if (DEFINED ENV{CUDA_HOME}) set(CUDA_TOOLKIT_ROOT_DIR "$ENV{CUDA_HOME}") endif() @@ -49,6 +55,8 @@ ADD_ARROW_LIB(arrow_cuda STATIC_LINK_LIBS ${ARROW_CUDA_SHARED_LINK_LIBS} ) +add_dependencies(arrow_cuda ${ARROW_CUDA_LIBRARIES}) + foreach(LIB_TARGET ${ARROW_CUDA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) @@ -63,21 +71,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda_version.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") -install(FILES - cuda_api.h - cuda_arrow_ipc.h - cuda_context.h - cuda_memory.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") - -# pkg-config support -configure_file(arrow-cuda.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" - @ONLY) - -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_INSTALL_ALL_HEADERS("arrow/gpu") +ARROW_ADD_PKG_CONFIG("arrow-cuda") set(ARROW_CUDA_TEST_LINK_LIBS arrow_cuda_shared @@ -90,9 +85,10 @@ if (ARROW_BUILD_TESTS) endif() if (ARROW_BUILD_BENCHMARKS) - cuda_add_executable(cuda-benchmark cuda-benchmark.cc) - target_link_libraries(cuda-benchmark + cuda_add_executable(arrow-cuda-benchmark cuda-benchmark.cc) + target_link_libraries(arrow-cuda-benchmark arrow_cuda_shared - gtest_static + ${GTEST_LIBRARY} ${ARROW_BENCHMARK_LINK_LIBS}) + add_dependencies(arrow_cuda-benchmarks arrow-cuda-benchmark) endif() diff --git a/cpp/src/arrow/gpu/cuda-test.cc b/cpp/src/arrow/gpu/cuda-test.cc index 5d85a81a23641..628d0f2774a20 100644 --- a/cpp/src/arrow/gpu/cuda-test.cc +++ b/cpp/src/arrow/gpu/cuda-test.cc @@ -343,5 +343,19 @@ TEST_F(TestCudaArrowIpc, BasicWriteRead) { CompareBatch(*batch, *cpu_batch); } +class TestCudaContext : public TestCudaBufferBase { + public: + void SetUp() { TestCudaBufferBase::SetUp(); } +}; + +TEST_F(TestCudaContext, GetDeviceAddress) { + const int64_t kSize = 100; + std::shared_ptr buffer; + uint8_t* devptr = NULL; + ASSERT_OK(context_->Allocate(kSize, &buffer)); + ASSERT_OK(context_->GetDeviceAddress(buffer.get()->mutable_data(), &devptr)); + ASSERT_EQ(buffer.get()->mutable_data(), devptr); +} + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc index 03256a1f52c70..b4d8744cb0bd0 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc @@ -82,9 +82,8 @@ Status ReadMessage(CudaBufferReader* reader, MemoryPool* pool, RETURN_NOT_OK(AllocateBuffer(pool, message_length, &metadata)); RETURN_NOT_OK(reader->Read(message_length, &bytes_read, metadata->mutable_data())); if (bytes_read != message_length) { - std::stringstream ss; - ss << "Expected " << message_length << " metadata bytes, but only got " << bytes_read; - return Status::IOError(ss.str()); + return Status::IOError("Expected ", message_length, " metadata bytes, but only got ", + bytes_read); } return ipc::Message::ReadFrom(metadata, reader, out); diff --git a/cpp/src/arrow/gpu/cuda_common.h b/cpp/src/arrow/gpu/cuda_common.h index 
a53dd220adda0..2b630c8114325 100644 --- a/cpp/src/arrow/gpu/cuda_common.h +++ b/cpp/src/arrow/gpu/cuda_common.h @@ -34,15 +34,13 @@ namespace cuda { (void)ret; \ } while (0) -#define CU_RETURN_NOT_OK(STMT) \ - do { \ - CUresult ret = (STMT); \ - if (ret != CUDA_SUCCESS) { \ - std::stringstream ss; \ - ss << "Cuda Driver API call in " << __FILE__ << " at line " << __LINE__ \ - << " failed with code " << ret << ": " << #STMT; \ - return Status::IOError(ss.str()); \ - } \ +#define CU_RETURN_NOT_OK(STMT) \ + do { \ + CUresult ret = (STMT); \ + if (ret != CUDA_SUCCESS) { \ + return Status::IOError("Cuda Driver API call in ", __FILE__, " at line ", \ + __LINE__, " failed with code ", ret, ": ", #STMT); \ + } \ } while (0) } // namespace cuda diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index 9e95040837bf5..2f3f1bd3f10de 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -343,5 +343,12 @@ void* CudaContext::handle() const { return impl_->context_handle(); } int CudaContext::device_number() const { return impl_->device().device_num; } +Status CudaContext::GetDeviceAddress(uint8_t* addr, uint8_t** devaddr) { + ContextSaver set_temporary(reinterpret_cast(handle())); + CU_RETURN_NOT_OK(cuPointerGetAttribute(devaddr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, + reinterpret_cast(addr))); + return Status::OK(); +} + } // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h index 9a67cea8975d1..938a81561d042 100644 --- a/cpp/src/arrow/gpu/cuda_context.h +++ b/cpp/src/arrow/gpu/cuda_context.h @@ -37,23 +37,23 @@ class ARROW_EXPORT CudaDeviceManager { static Status GetInstance(CudaDeviceManager** manager); /// \brief Get the CUDA driver context for a particular device - /// \param[in] device_number + /// \param[in] device_number the CUDA device /// \param[out] out cached context - Status GetContext(int gpu_number, std::shared_ptr* ctx); + Status GetContext(int device_number, std::shared_ptr* out); /// \brief Get the shared CUDA driver context for a particular device - /// \param[in] device_number + /// \param[in] device_number the CUDA device /// \param[in] handle CUDA context handler created by another library /// \param[out] out shared context Status GetSharedContext(int device_number, void* handle, std::shared_ptr* out); /// \brief Allocate host memory with fast access to given GPU device - /// \param[in] device_number + /// \param[in] device_number the CUDA device /// \param[in] nbytes number of bytes /// \param[out] out the allocated buffer Status AllocateHost(int device_number, int64_t nbytes, - std::shared_ptr* buffer); + std::shared_ptr* out); Status FreeHost(void* data, int64_t nbytes); @@ -98,15 +98,15 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this* buffer); + std::shared_ptr* out); /// \brief Close memory mapped with IPC buffer /// \param[in] buffer a CudaBuffer referencing /// \return Status - Status CloseIpcBuffer(CudaBuffer* buf); + Status CloseIpcBuffer(CudaBuffer* buffer); /// \brief Block until the all device tasks are completed. 
Status Synchronize(void); @@ -119,6 +119,20 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_thismutable_data(); size_ = buffer->size(); position_ = 0; + closed_ = false; + } + +#define CHECK_CLOSED() \ + if (closed_) { \ + return Status::Invalid("Operation on closed CudaBufferWriter"); \ } Status Seek(int64_t position) { + CHECK_CLOSED(); if (position < 0 || position >= size_) { return Status::IOError("position out of bounds"); } @@ -234,12 +241,17 @@ class CudaBufferWriter::CudaBufferWriterImpl { Status Close() { if (!closed_) { closed_ = true; - RETURN_NOT_OK(Flush()); + RETURN_NOT_OK(FlushInternal()); } return Status::OK(); } Status Flush() { + CHECK_CLOSED(); + return FlushInternal(); + } + + Status FlushInternal() { if (buffer_size_ > 0 && buffer_position_ > 0) { // Only need to flush when the write has been buffered RETURN_NOT_OK( @@ -253,11 +265,13 @@ class CudaBufferWriter::CudaBufferWriterImpl { bool closed() const { return closed_; } Status Tell(int64_t* position) const { + CHECK_CLOSED(); *position = position_; return Status::OK(); } Status Write(const void* data, int64_t nbytes) { + CHECK_CLOSED(); if (nbytes == 0) { return Status::OK(); } @@ -283,11 +297,13 @@ class CudaBufferWriter::CudaBufferWriterImpl { Status WriteAt(int64_t position, const void* data, int64_t nbytes) { std::lock_guard guard(lock_); + CHECK_CLOSED(); RETURN_NOT_OK(Seek(position)); return Write(data, nbytes); } Status SetBufferSize(const int64_t buffer_size) { + CHECK_CLOSED(); if (buffer_position_ > 0) { // Flush any buffered data RETURN_NOT_OK(Flush()); @@ -303,6 +319,8 @@ class CudaBufferWriter::CudaBufferWriterImpl { int64_t buffer_position() const { return buffer_position_; } +#undef CHECK_CLOSED + private: std::shared_ptr context_; std::shared_ptr buffer_; diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h index c8f80837cd9df..193deed82e554 100644 --- a/cpp/src/arrow/gpu/cuda_memory.h +++ b/cpp/src/arrow/gpu/cuda_memory.h @@ -57,7 +57,9 @@ class ARROW_EXPORT CudaBuffer : public Buffer { std::shared_ptr* out); /// \brief Copy memory from GPU device to CPU host - /// \param[out] out a pre-allocated output buffer + /// \param[in] position start position inside buffer to copy bytes from + /// \param[in] nbytes number of bytes to copy + /// \param[out] out start address of the host memory area to copy to /// \return Status Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const; @@ -69,8 +71,8 @@ class ARROW_EXPORT CudaBuffer : public Buffer { Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes); /// \brief Copy memory from device to device at position - /// \param[in] position start position to copy bytes - /// \param[in] data the device data to copy + /// \param[in] position start position inside buffer to copy bytes to + /// \param[in] data start address of the device memory area to copy from /// \param[in] nbytes number of bytes to copy /// \return Status /// @@ -207,7 +209,7 @@ class ARROW_EXPORT CudaBufferWriter : public io::WritableFile { }; /// \brief Allocate CUDA-accessible memory on CPU host -/// \param[in] device_number +/// \param[in] device_number device to expose host memory /// \param[in] size number of bytes /// \param[out] out the allocated buffer /// \return Status diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index d21bb16755271..13b577f7d41b2 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -18,28 +18,27 @@ # 
---------------------------------------------------------------------- # arrow_io : Arrow IO interfaces -ADD_ARROW_TEST(io-buffered-test) -ADD_ARROW_TEST(io-compressed-test) -ADD_ARROW_TEST(io-file-test) +ADD_ARROW_TEST(buffered-test + PREFIX "arrow-io") +ADD_ARROW_TEST(compressed-test + PREFIX "arrow-io") +ADD_ARROW_TEST(file-test + PREFIX "arrow-io") if (ARROW_HDFS AND NOT ARROW_BOOST_HEADER_ONLY) - ADD_ARROW_TEST(io-hdfs-test NO_VALGRIND) + ADD_ARROW_TEST(hdfs-test NO_VALGRIND + PREFIX "arrow-io") endif() -ADD_ARROW_TEST(io-memory-test) -ADD_ARROW_TEST(io-readahead-test) +ADD_ARROW_TEST(memory-test + PREFIX "arrow-io") +ADD_ARROW_TEST(readahead-test + PREFIX "arrow-io") -ADD_ARROW_BENCHMARK(io-file-benchmark) -ADD_ARROW_BENCHMARK(io-memory-benchmark) +ADD_ARROW_BENCHMARK(file-benchmark + PREFIX "arrow-io") +ADD_ARROW_BENCHMARK(memory-benchmark + PREFIX "arrow-io") # Headers: top level -install(FILES - api.h - buffered.h - compressed.h - file.h - hdfs.h - interfaces.h - memory.h - readahead.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/io") +ARROW_INSTALL_ALL_HEADERS("arrow/io") diff --git a/cpp/src/arrow/io/api.h b/cpp/src/arrow/io/api.h index 0d5742ad65864..cf1be337fd1a9 100644 --- a/cpp/src/arrow/io/api.h +++ b/cpp/src/arrow/io/api.h @@ -18,6 +18,7 @@ #ifndef ARROW_IO_API_H #define ARROW_IO_API_H +#include "arrow/io/buffered.h" #include "arrow/io/compressed.h" #include "arrow/io/file.h" #include "arrow/io/hdfs.h" diff --git a/cpp/src/arrow/io/io-buffered-test.cc b/cpp/src/arrow/io/buffered-test.cc similarity index 97% rename from cpp/src/arrow/io/io-buffered-test.cc rename to cpp/src/arrow/io/buffered-test.cc index 7fc4c520d148b..ee07556de16f9 100644 --- a/cpp/src/arrow/io/io-buffered-test.cc +++ b/cpp/src/arrow/io/buffered-test.cc @@ -67,7 +67,7 @@ class FileTestFixture : public ::testing::Test { void EnsureFileDeleted() { if (FileExists(path_)) { - std::remove(path_.c_str()); + ARROW_UNUSED(std::remove(path_.c_str())); } } @@ -105,7 +105,8 @@ class TestBufferedOutputStream : public FileTestFixture { lseek(fd_, 0, SEEK_END); #endif } - ASSERT_OK(BufferedOutputStream::Create(file, buffer_size, &buffered_)); + ASSERT_OK(BufferedOutputStream::Create(buffer_size, default_memory_pool(), file, + &buffered_)); } void WriteChunkwise(const std::string& datastr, const std::valarray& sizes) { @@ -301,7 +302,7 @@ TEST_F(TestBufferedOutputStream, TruncatesFile) { // ---------------------------------------------------------------------- // BufferedInputStream tests -const char kExample1[] = ("informaticacrobaticsimmolation"); +const char kExample1[] = "informaticacrobaticsimmolation"; class TestBufferedInputStream : public FileTestFixture { public: @@ -321,7 +322,7 @@ class TestBufferedInputStream : public FileTestFixture { std::shared_ptr file_in; ASSERT_OK(ReadableFile::Open(path_, &file_in)); raw_ = file_in; - ASSERT_OK(BufferedInputStream::Create(raw_, buffer_size, pool, &buffered_)); + ASSERT_OK(BufferedInputStream::Create(buffer_size, pool, raw_, &buffered_)); } protected: diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index 0c04ac21c208e..0b1431f440fa2 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -21,10 +21,10 @@ #include #include #include -#include #include #include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" @@ -91,8 +91,8 @@ class BufferedBase { class BufferedOutputStream::Impl : public BufferedBase { public: - explicit 
Impl(std::shared_ptr raw) - : BufferedBase(default_memory_pool()), raw_(std::move(raw)) {} + explicit Impl(std::shared_ptr raw, MemoryPool* pool) + : BufferedBase(pool), raw_(std::move(raw)) {} Status Close() { std::lock_guard guard(lock_); @@ -173,14 +173,16 @@ class BufferedOutputStream::Impl : public BufferedBase { std::shared_ptr raw_; }; -BufferedOutputStream::BufferedOutputStream(std::shared_ptr raw) - : impl_(new BufferedOutputStream::Impl(std::move(raw))) {} +BufferedOutputStream::BufferedOutputStream(std::shared_ptr raw, + MemoryPool* pool) { + impl_.reset(new Impl(std::move(raw), pool)); +} -Status BufferedOutputStream::Create(std::shared_ptr raw, - int64_t buffer_size, +Status BufferedOutputStream::Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, std::shared_ptr* out) { - auto result = - std::shared_ptr(new BufferedOutputStream(std::move(raw))); + auto result = std::shared_ptr( + new BufferedOutputStream(std::move(raw), pool)); RETURN_NOT_OK(result->SetBufferSize(buffer_size)); *out = std::move(result); return Status::OK(); @@ -217,12 +219,12 @@ std::shared_ptr BufferedOutputStream::raw() const { return impl_-> // ---------------------------------------------------------------------- // BufferedInputStream implementation -class BufferedInputStream::BufferedInputStreamImpl : public BufferedBase { +class BufferedInputStream::Impl : public BufferedBase { public: - BufferedInputStreamImpl(std::shared_ptr raw, MemoryPool* pool) + Impl(std::shared_ptr raw, MemoryPool* pool) : BufferedBase(pool), raw_(std::move(raw)), bytes_buffered_(0) {} - ~BufferedInputStreamImpl() { DCHECK_OK(Close()); } + ~Impl() { DCHECK_OK(Close()); } Status Close() { std::lock_guard guard(lock_); @@ -350,13 +352,13 @@ class BufferedInputStream::BufferedInputStreamImpl : public BufferedBase { BufferedInputStream::BufferedInputStream(std::shared_ptr raw, MemoryPool* pool) { - impl_.reset(new BufferedInputStreamImpl(std::move(raw), pool)); + impl_.reset(new Impl(std::move(raw), pool)); } BufferedInputStream::~BufferedInputStream() { DCHECK_OK(impl_->Close()); } -Status BufferedInputStream::Create(std::shared_ptr raw, int64_t buffer_size, - MemoryPool* pool, +Status BufferedInputStream::Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, std::shared_ptr* out) { auto result = std::shared_ptr(new BufferedInputStream(std::move(raw), pool)); diff --git a/cpp/src/arrow/io/buffered.h b/cpp/src/arrow/io/buffered.h index e4374ba8079d3..945915bfe998f 100644 --- a/cpp/src/arrow/io/buffered.h +++ b/cpp/src/arrow/io/buffered.h @@ -29,6 +29,7 @@ namespace arrow { +class Buffer; class MemoryPool; class Status; @@ -39,12 +40,13 @@ class ARROW_EXPORT BufferedOutputStream : public OutputStream { ~BufferedOutputStream() override; /// \brief Create a buffered output stream wrapping the given output stream. + /// \param[in] buffer_size the size of the temporary write buffer + /// \param[in] pool a MemoryPool to use for allocations /// \param[in] raw another OutputStream - /// \param[in] buffer_size the size of the temporary buffer. 
Allocates from - /// the default memory pool /// \param[out] out the created BufferedOutputStream /// \return Status - static Status Create(std::shared_ptr raw, int64_t buffer_size, + static Status Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, std::shared_ptr* out); /// \brief Resize internal buffer @@ -78,7 +80,7 @@ class ARROW_EXPORT BufferedOutputStream : public OutputStream { std::shared_ptr raw() const; private: - explicit BufferedOutputStream(std::shared_ptr raw); + explicit BufferedOutputStream(std::shared_ptr raw, MemoryPool* pool); class ARROW_NO_EXPORT Impl; std::unique_ptr impl_; @@ -93,12 +95,13 @@ class ARROW_EXPORT BufferedInputStream : public InputStream { ~BufferedInputStream() override; /// \brief Create a BufferedInputStream from a raw InputStream - /// \param[in] raw a raw InputStream /// \param[in] buffer_size the size of the temporary read buffer /// \param[in] pool a MemoryPool to use for allocations + /// \param[in] raw a raw InputStream /// \param[out] out the created BufferedInputStream - static Status Create(std::shared_ptr raw, int64_t buffer_size, - MemoryPool* pool, std::shared_ptr* out); + static Status Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, + std::shared_ptr* out); /// \brief Resize internal read buffer; calls to Read(...) will read at least /// \param[in] new_buffer_size the new read buffer size @@ -137,8 +140,8 @@ class ARROW_EXPORT BufferedInputStream : public InputStream { private: explicit BufferedInputStream(std::shared_ptr raw, MemoryPool* pool); - class ARROW_NO_EXPORT BufferedInputStreamImpl; - std::unique_ptr impl_; + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; }; } // namespace io diff --git a/cpp/src/arrow/io/io-compressed-test.cc b/cpp/src/arrow/io/compressed-test.cc similarity index 98% rename from cpp/src/arrow/io/io-compressed-test.cc rename to cpp/src/arrow/io/compressed-test.cc index 507302f384c0b..ce6533ca2604c 100644 --- a/cpp/src/arrow/io/io-compressed-test.cc +++ b/cpp/src/arrow/io/compressed-test.cc @@ -73,7 +73,7 @@ std::shared_ptr CompressDataOneShot(Codec* codec, ABORT_NOT_OK(codec->Compress(data.size(), data.data(), max_compressed_len, compressed->mutable_data(), &compressed_len)); ABORT_NOT_OK(compressed->Resize(compressed_len)); - return compressed; + return std::move(compressed); } Status RunCompressedInputStream(Codec* codec, std::shared_ptr compressed, @@ -199,12 +199,14 @@ TEST_P(CompressedInputStreamTest, InvalidData) { INSTANTIATE_TEST_CASE_P(TestGZipInputStream, CompressedInputStreamTest, ::testing::Values(Compression::GZIP)); -INSTANTIATE_TEST_CASE_P(TestZSTDInputStream, CompressedInputStreamTest, - ::testing::Values(Compression::ZSTD)); - INSTANTIATE_TEST_CASE_P(TestBrotliInputStream, CompressedInputStreamTest, ::testing::Values(Compression::BROTLI)); +#ifdef ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTDInputStream, CompressedInputStreamTest, + ::testing::Values(Compression::ZSTD)); +#endif + class CompressedOutputStreamTest : public ::testing::TestWithParam { protected: Compression::type GetCompression() { return GetParam(); } @@ -235,11 +237,13 @@ TEST_P(CompressedOutputStreamTest, RandomData) { INSTANTIATE_TEST_CASE_P(TestGZipOutputStream, CompressedOutputStreamTest, ::testing::Values(Compression::GZIP)); -INSTANTIATE_TEST_CASE_P(TestZSTDOutputStream, CompressedOutputStreamTest, - ::testing::Values(Compression::ZSTD)); - INSTANTIATE_TEST_CASE_P(TestBrotliOutputStream, CompressedOutputStreamTest, ::testing::Values(Compression::BROTLI)); +#ifdef 
ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTDOutputStream, CompressedOutputStreamTest, + ::testing::Values(Compression::ZSTD)); +#endif + } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index e5fd6b4adf4c7..1311dbc246634 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -44,7 +44,7 @@ namespace io { class CompressedOutputStream::Impl { public: Impl(MemoryPool* pool, Codec* codec, const std::shared_ptr& raw) - : pool_(pool), raw_(raw), codec_(codec), is_open_(true) {} + : pool_(pool), raw_(raw), codec_(codec), is_open_(true), compressed_pos_(0) {} ~Impl() { DCHECK(Close().ok()); } diff --git a/cpp/src/arrow/io/io-file-benchmark.cc b/cpp/src/arrow/io/file-benchmark.cc similarity index 93% rename from cpp/src/arrow/io/io-file-benchmark.cc rename to cpp/src/arrow/io/file-benchmark.cc index c57fa6d605d68..3e99ba077acb3 100644 --- a/cpp/src/arrow/io/io-file-benchmark.cc +++ b/cpp/src/arrow/io/file-benchmark.cc @@ -30,12 +30,18 @@ #include #include +#ifndef _WIN32 + #include #include #include +#endif + namespace arrow { +#ifndef _WIN32 + std::string GetNullFile() { return "/dev/null"; } const std::valarray small_sizes = {8, 24, 33, 1, 32, 192, 16, 40}; @@ -163,7 +169,8 @@ static void BM_BufferedOutputStreamSmallWritesToNull( ABORT_NOT_OK(io::FileOutputStream::Open(GetNullFile(), &file)); std::shared_ptr buffered_file; - ABORT_NOT_OK(io::BufferedOutputStream::Create(file, kBufferSize, &buffered_file)); + ABORT_NOT_OK(io::BufferedOutputStream::Create(kBufferSize, default_memory_pool(), file, + &buffered_file)); BenchmarkStreamingWrites(state, small_sizes, buffered_file.get()); } @@ -196,7 +203,8 @@ static void BM_BufferedOutputStreamSmallWritesToPipe( SetupPipeWriter(&stream, &reader); std::shared_ptr buffered_stream; - ABORT_NOT_OK(io::BufferedOutputStream::Create(stream, kBufferSize, &buffered_stream)); + ABORT_NOT_OK(io::BufferedOutputStream::Create(kBufferSize, default_memory_pool(), + stream, &buffered_stream)); BenchmarkStreamingWrites(state, small_sizes, buffered_stream.get(), reader.get()); } @@ -207,7 +215,8 @@ static void BM_BufferedOutputStreamLargeWritesToPipe( SetupPipeWriter(&stream, &reader); std::shared_ptr buffered_stream; - ABORT_NOT_OK(io::BufferedOutputStream::Create(stream, kBufferSize, &buffered_stream)); + ABORT_NOT_OK(io::BufferedOutputStream::Create(kBufferSize, default_memory_pool(), + stream, &buffered_stream)); BenchmarkStreamingWrites(state, large_sizes, buffered_stream.get(), reader.get()); } @@ -241,4 +250,6 @@ BENCHMARK(BM_BufferedOutputStreamLargeWritesToPipe) ->MinTime(1.0) ->UseRealTime(); +#endif // ifndef _WIN32 + } // namespace arrow diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/file-test.cc similarity index 99% rename from cpp/src/arrow/io/io-file-test.cc rename to cpp/src/arrow/io/file-test.cc index 6081005a8f6e1..f329ae9d504e5 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -56,7 +56,7 @@ class FileTestFixture : public ::testing::Test { void EnsureFileDeleted() { if (FileExists(path_)) { - std::remove(path_.c_str()); + ARROW_UNUSED(std::remove(path_.c_str())); } } @@ -460,9 +460,7 @@ class MyMemoryPool : public MemoryPool { *ptr = reinterpret_cast(std::realloc(*ptr, new_size)); if (*ptr == NULL) { - std::stringstream ss; - ss << "realloc of size " << new_size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("realloc of size ", new_size, " failed"); } return 
Status::OK(); @@ -470,10 +468,10 @@ class MyMemoryPool : public MemoryPool { int64_t bytes_allocated() const override { return -1; } - int64_t num_allocations() const { return num_allocations_; } + int64_t num_allocations() const { return num_allocations_.load(); } private: - int64_t num_allocations_; + std::atomic num_allocations_; }; TEST_F(TestReadableFile, CustomMemoryPool) { diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 869d8e3720766..0398d5a1f9e80 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -479,9 +479,7 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { void* result = mmap(nullptr, static_cast(initial_size), prot_flags_, map_mode_, file_->fd(), 0); if (result == MAP_FAILED) { - std::stringstream ss; - ss << "Memory mapping file failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return Status::IOError("Memory mapping file failed: ", std::strerror(errno)); } size_ = capacity_ = initial_size; data_ = mutable_data_ = static_cast(result); diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index c8be5164cfa78..c273ab45f634f 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -218,9 +218,7 @@ static arrow::Status try_dlopen(std::vector potential_paths, const cha } if (out_handle == NULL) { - std::stringstream ss; - ss << "Unable to load " << name; - return arrow::Status::IOError(ss.str()); + return arrow::Status::IOError("Unable to load ", name); } return arrow::Status::OK(); @@ -243,9 +241,7 @@ static arrow::Status try_dlopen(std::vector potential_paths, const cha } if (out_handle == NULL) { - std::stringstream ss; - ss << "Unable to load " << name; - return arrow::Status::IOError(ss.str()); + return arrow::Status::IOError("Unable to load ", name); } return arrow::Status::OK(); diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/hdfs-test.cc similarity index 96% rename from cpp/src/arrow/io/io-hdfs-test.cc rename to cpp/src/arrow/io/hdfs-test.cc index c853b2012666e..08a7e13a1f8a2 100644 --- a/cpp/src/arrow/io/io-hdfs-test.cc +++ b/cpp/src/arrow/io/hdfs-test.cc @@ -257,6 +257,23 @@ TYPED_TEST(TestHadoopFileSystem, GetPathInfo) { ASSERT_EQ(size, info.size); } +TYPED_TEST(TestHadoopFileSystem, GetPathInfoNotExist) { + // ARROW-2919: Test that the error message is reasonable + SKIP_IF_NO_DRIVER(); + + ASSERT_OK(this->MakeScratchDir()); + auto path = this->ScratchPath("path-does-not-exist"); + + HdfsPathInfo info; + Status s = this->client_->GetPathInfo(path, &info); + ASSERT_TRUE(s.IsIOError()); + + const std::string error_message = s.ToString(); + + // Check that the file path is found in the error message + ASSERT_LT(error_message.find(path), std::string::npos); +} + TYPED_TEST(TestHadoopFileSystem, AppendToFile) { SKIP_IF_NO_DRIVER(); @@ -377,6 +394,8 @@ TYPED_TEST(TestHadoopFileSystem, LargeFile) { std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); + ASSERT_FALSE(file->closed()); + std::shared_ptr buffer; ASSERT_OK(AllocateBuffer(nullptr, size, &buffer)); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6f01f75eec3c1..0a50d3dcdcd90 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -43,14 +43,25 @@ using std::size_t; namespace arrow { namespace io { -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS " << WHAT << " failed, errno: " << errno << " (" << strerror(errno) \ - << ")"; \ - 
return Status::IOError(ss.str()); \ - } \ +namespace { + +std::string TranslateErrno(int error_code) { + std::stringstream ss; + ss << error_code << " (" << strerror(error_code) << ")"; + if (error_code == 255) { + // Unknown error can occur if the host is correct but the port is not + ss << " Please check that you are connecting to the correct HDFS RPC port"; + } + return ss.str(); +} + +} // namespace + +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + return Status::IOError("HDFS ", WHAT, " failed, errno: ", TranslateErrno(errno)); \ + } \ } while (0) static constexpr int kDefaultHdfsBufferSize = 1 << 16; @@ -99,6 +110,16 @@ class HdfsAnyFileImpl { bool is_open_; }; +namespace { + +Status GetPathInfoFailed(const std::string& path) { + std::stringstream ss; + ss << "Calling GetPathInfo for " << path << " failed. errno: " << TranslateErrno(errno); + return Status::IOError(ss.str()); +} + +} // namespace + // Private implementation for read-only files class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { public: @@ -180,7 +201,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { Status GetSize(int64_t* size) { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); if (entry == nullptr) { - return Status::IOError("HDFS: GetPathInfo failed"); + return GetPathInfoFailed(path_); } *size = entry->mSize; @@ -204,7 +225,7 @@ HdfsReadableFile::HdfsReadableFile(MemoryPool* pool) { impl_.reset(new HdfsReadableFileImpl(pool)); } -HdfsReadableFile::~HdfsReadableFile() { DCHECK(impl_->Close().ok()); } +HdfsReadableFile::~HdfsReadableFile() { DCHECK_OK(impl_->Close()); } Status HdfsReadableFile::Close() { return impl_->Close(); } @@ -272,7 +293,7 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } -HdfsOutputStream::~HdfsOutputStream() { DCHECK(impl_->Close().ok()); } +HdfsOutputStream::~HdfsOutputStream() { DCHECK_OK(impl_->Close()); } Status HdfsOutputStream::Close() { return impl_->Close(); } @@ -315,7 +336,7 @@ static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { // Private implementation class HadoopFileSystem::HadoopFileSystemImpl { public: - HadoopFileSystemImpl() {} + HadoopFileSystemImpl() : driver_(NULLPTR), port_(0), fs_(NULLPTR) {} Status Connect(const HdfsConnectionConfig* config) { if (config->driver == HdfsDriver::LIBHDFS3) { @@ -399,7 +420,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); if (entry == nullptr) { - return Status::IOError("HDFS: GetPathInfo failed"); + return GetPathInfoFailed(path); } SetPathInfo(entry, info); @@ -443,10 +464,8 @@ class HadoopFileSystem::HadoopFileSystemImpl { if ((errno == 0) || (errno == ENOENT && Exists(path))) { num_entries = 0; } else { - std::stringstream ss; - ss << "HDFS list directory failed, errno: " << errno << " (" << strerror(errno) - << ")"; - return Status::IOError(ss.str()); + return Status::IOError("HDFS list directory failed, errno: ", + TranslateErrno(errno)); } } @@ -469,14 +488,9 @@ class HadoopFileSystem::HadoopFileSystemImpl { hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); if (handle == nullptr) { - std::stringstream ss; - if (!Exists(path)) { - ss << "HDFS file does not exist: " << path; - } else { - // TODO(wesm): determine other causes of failure - ss << "HDFS path exists, but opening file failed: " << path; - } - return 
Status::IOError(ss.str()); + const char* msg = !Exists(path) ? "HDFS file does not exist: " + : "HDFS path exists, but opening file failed: "; + return Status::IOError(msg, path); } // std::make_shared does not work with private ctors @@ -498,10 +512,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { static_cast(default_block_size)); if (handle == nullptr) { - // TODO(wesm): determine cause of failure - std::stringstream ss; - ss << "Unable to open file " << path; - return Status::IOError(ss.str()); + return Status::IOError("Unable to open file ", path); } // std::make_shared does not work with private ctors diff --git a/cpp/src/arrow/io/io-memory-benchmark.cc b/cpp/src/arrow/io/io-memory-benchmark.cc deleted file mode 100644 index 72a5dc8ac2a7f..0000000000000 --- a/cpp/src/arrow/io/io-memory-benchmark.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/api.h" -#include "arrow/io/memory.h" -#include "arrow/test-util.h" - -#include "benchmark/benchmark.h" - -#include - -namespace arrow { - -static void BM_SerialMemcopy(benchmark::State& state) { // NOLINT non-const reference - constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB - - std::shared_ptr buffer1, buffer2; - ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer1)); - ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer2)); - random_bytes(kTotalSize, 0, buffer2->mutable_data()); - - while (state.KeepRunning()) { - io::FixedSizeBufferWriter writer(buffer1); - ABORT_NOT_OK(writer.Write(buffer2->data(), buffer2->size())); - } - state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); -} - -static void BM_ParallelMemcopy(benchmark::State& state) { // NOLINT non-const reference - constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB - - std::shared_ptr buffer1, buffer2; - ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer1)); - ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer2)); - - random_bytes(kTotalSize, 0, buffer2->mutable_data()); - - while (state.KeepRunning()) { - io::FixedSizeBufferWriter writer(buffer1); - writer.set_memcopy_threads(4); - ABORT_NOT_OK(writer.Write(buffer2->data(), buffer2->size())); - } - state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); -} - -BENCHMARK(BM_SerialMemcopy)->MinTime(1.0)->Repetitions(2)->UseRealTime(); - -BENCHMARK(BM_ParallelMemcopy)->MinTime(1.0)->Repetitions(2)->UseRealTime(); - -} // namespace arrow diff --git a/cpp/src/arrow/io/memory-benchmark.cc b/cpp/src/arrow/io/memory-benchmark.cc new file mode 100644 index 0000000000000..b36be4de1639c --- /dev/null +++ b/cpp/src/arrow/io/memory-benchmark.cc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include + +#include "arrow/api.h" +#include "arrow/io/memory.h" +#include "arrow/test-util.h" +#include "arrow/util/cpu-info.h" + +#include "benchmark/benchmark.h" + +namespace arrow { + +static const int kNumCores = internal::CpuInfo::GetInstance()->num_cores(); +constexpr size_t kMemoryPerCore = 32 * 1024 * 1024; +using BufferPtr = std::shared_ptr; + +using VectorType = __m128i; + +// See http://codearcana.com/posts/2013/05/18/achieving-maximum-memory-bandwidth.html +// for the usage of stream loads/writes. Or section 6.1, page 47 of +// https://akkadia.org/drepper/cpumemory.pdf . + +static void Read(void* src, void* dst, size_t size) { + auto simd = static_cast(src); + (void)dst; + + for (size_t i = 0; i < size / sizeof(VectorType); i++) + benchmark::DoNotOptimize(_mm_stream_load_si128(&simd[i])); +} + +static void Write(void* src, void* dst, size_t size) { + auto simd = static_cast(dst); + const VectorType ones = _mm_set1_epi32(1); + (void)src; + + for (size_t i = 0; i < size / sizeof(VectorType); i++) _mm_stream_si128(&simd[i], ones); +} + +static void ReadWrite(void* src, void* dst, size_t size) { + auto src_simd = static_cast(src); + auto dst_simd = static_cast(dst); + + for (size_t i = 0; i < size / sizeof(VectorType); i++) + _mm_stream_si128(&dst_simd[i], _mm_stream_load_si128(&src_simd[i])); +} + +using ApplyFn = decltype(Read); + +template +static void MemoryBandwidth(benchmark::State& state) { // NOLINT non-const reference + const size_t buffer_size = kMemoryPerCore; + BufferPtr src, dst; + + ABORT_NOT_OK(AllocateBuffer(buffer_size, &src)); + ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst)); + random_bytes(buffer_size, 0, src->mutable_data()); + + while (state.KeepRunning()) { + Apply(src->mutable_data(), dst->mutable_data(), buffer_size); + } + + state.SetBytesProcessed(state.iterations() * buffer_size); +} + +// `UseRealTime` is required due to threads, otherwise the cumulative CPU time +// is used which will skew the results by the number of threads. 
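
The comment above captures a real pitfall in multi-threaded Google Benchmark runs. A minimal standalone sketch of the effect `UseRealTime()` corrects — not part of this patch, and `BM_SpinWork` is a made-up name:

```cpp
// Sketch: with N threads spinning concurrently, wall-clock time per iteration
// stays roughly flat, but the summed CPU time grows ~N-fold. Without
// UseRealTime(), Google Benchmark derives per-iteration cost from cumulative
// CPU time, which would make the threaded bandwidth numbers above look
// N times slower than they are.
#include <cstdint>

#include "benchmark/benchmark.h"

static void BM_SpinWork(benchmark::State& state) {  // NOLINT non-const reference
  while (state.KeepRunning()) {
    int64_t acc = 0;
    for (int i = 0; i < 1000000; ++i) {
      benchmark::DoNotOptimize(acc += i);
    }
  }
}

// Report wall-clock time so results are comparable across thread counts.
BENCHMARK(BM_SpinWork)->ThreadRange(1, 8)->UseRealTime();

BENCHMARK_MAIN();
```
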
+BENCHMARK_TEMPLATE(MemoryBandwidth, Read)->ThreadRange(1, kNumCores)->UseRealTime(); +BENCHMARK_TEMPLATE(MemoryBandwidth, Write)->ThreadRange(1, kNumCores)->UseRealTime(); +BENCHMARK_TEMPLATE(MemoryBandwidth, ReadWrite)->ThreadRange(1, kNumCores)->UseRealTime(); + +static void ParallelMemoryCopy(benchmark::State& state) { // NOLINT non-const reference + const int64_t n_threads = state.range(0); + const int64_t buffer_size = kMemoryPerCore; + + std::shared_ptr src, dst; + ABORT_NOT_OK(AllocateBuffer(buffer_size, &src)); + ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst)); + + random_bytes(buffer_size, 0, src->mutable_data()); + + while (state.KeepRunning()) { + io::FixedSizeBufferWriter writer(dst); + writer.set_memcopy_threads(static_cast(n_threads)); + ABORT_NOT_OK(writer.Write(src->data(), src->size())); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * buffer_size); + state.counters["threads"] = static_cast(n_threads); +} + +BENCHMARK(ParallelMemoryCopy)->RangeMultiplier(2)->Range(1, kNumCores)->UseRealTime(); + +} // namespace arrow diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/memory-test.cc similarity index 100% rename from cpp/src/arrow/io/io-memory-test.cc rename to cpp/src/arrow/io/memory-test.cc diff --git a/cpp/src/arrow/io/io-readahead-test.cc b/cpp/src/arrow/io/readahead-test.cc similarity index 82% rename from cpp/src/arrow/io/io-readahead-test.cc rename to cpp/src/arrow/io/readahead-test.cc index b7f404f666983..6575e898590d8 100644 --- a/cpp/src/arrow/io/io-readahead-test.cc +++ b/cpp/src/arrow/io/readahead-test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,51 @@ using internal::checked_cast; namespace io { namespace internal { +class LockedInputStream : public InputStream { + public: + explicit LockedInputStream(const std::shared_ptr& stream) + : stream_(stream) {} + + Status Close() override { + std::lock_guard lock(mutex_); + return stream_->Close(); + } + + bool closed() const override { + std::lock_guard lock(mutex_); + return stream_->closed(); + } + + Status Tell(int64_t* position) const override { + std::lock_guard lock(mutex_); + return stream_->Tell(position); + } + + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override { + std::lock_guard lock(mutex_); + return stream_->Read(nbytes, bytes_read, buffer); + } + + Status Read(int64_t nbytes, std::shared_ptr* out) override { + std::lock_guard lock(mutex_); + return stream_->Read(nbytes, out); + } + + bool supports_zero_copy() const override { + std::lock_guard lock(mutex_); + return stream_->supports_zero_copy(); + } + + util::string_view Peek(int64_t nbytes) const override { + std::lock_guard lock(mutex_); + return stream_->Peek(nbytes); + } + + protected: + std::shared_ptr stream_; + mutable std::mutex mutex_; +}; + static void sleep_for(double seconds) { std::this_thread::sleep_for( std::chrono::nanoseconds(static_cast(seconds * 1e9))); @@ -57,13 +103,13 @@ static void busy_wait(double seconds, std::function predicate) { } } -std::shared_ptr DataReader(const std::string& data) { +std::shared_ptr DataReader(const std::string& data) { std::shared_ptr buffer; ABORT_NOT_OK(Buffer::FromString(data, &buffer)); - return std::make_shared(buffer); + return std::make_shared(std::make_shared(buffer)); } -static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected, +static int64_t WaitForPosition(const FileInterface& file, int64_t expected, double seconds = 0.2) { int64_t pos = -1; busy_wait(seconds, [&]() 
-> bool { @@ -73,12 +119,12 @@ static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected, return pos; } -static void AssertEventualPosition(const RandomAccessFile& file, int64_t expected) { +static void AssertEventualPosition(const FileInterface& file, int64_t expected) { int64_t pos = WaitForPosition(file, expected); ASSERT_EQ(pos, expected) << "File didn't reach expected position"; } -static void AssertPosition(const RandomAccessFile& file, int64_t expected) { +static void AssertPosition(const FileInterface& file, int64_t expected) { int64_t pos = -1; ABORT_NOT_OK(file.Tell(&pos)); ASSERT_EQ(pos, expected) << "File didn't reach expected position"; diff --git a/cpp/src/arrow/io/readahead.cc b/cpp/src/arrow/io/readahead.cc index 89db6a66e8c8d..4222f87a5ca3b 100644 --- a/cpp/src/arrow/io/readahead.cc +++ b/cpp/src/arrow/io/readahead.cc @@ -162,11 +162,13 @@ class ReadaheadSpooler::Impl { int64_t bytes_read; RETURN_NOT_OK(AllocateResizableBuffer( pool_, read_size_ + buf->left_padding + buf->right_padding, &buffer)); + DCHECK_NE(buffer->mutable_data(), nullptr); RETURN_NOT_OK( raw_->Read(read_size_, &bytes_read, buffer->mutable_data() + buf->left_padding)); if (bytes_read < read_size_) { // Got a short read RETURN_NOT_OK(buffer->Resize(bytes_read + buf->left_padding + buf->right_padding)); + DCHECK_NE(buffer->mutable_data(), nullptr); } // Zero padding areas memset(buffer->mutable_data(), 0, buf->left_padding); diff --git a/cpp/src/arrow/io/test-common.h b/cpp/src/arrow/io/test-common.h index fa9145259b182..d33e101175633 100644 --- a/cpp/src/arrow/io/test-common.h +++ b/cpp/src/arrow/io/test-common.h @@ -25,16 +25,11 @@ #include #include -#ifndef _MSC_VER -#include -#endif - -#if defined(__MINGW32__) // MinGW -// nothing -#elif defined(_MSC_VER) // Visual Studio +#ifdef _WIN32 +#include #include -#else // POSIX / Linux -// nothing +#else +#include #endif #include "arrow/buffer.h" @@ -64,7 +59,7 @@ static inline bool FileExists(const std::string& path) { return std::ifstream(path.c_str()).good(); } -#if defined(_MSC_VER) +#if defined(_WIN32) static inline void InvalidParamHandler(const wchar_t* expr, const wchar_t* func, const wchar_t* source_file, unsigned int source_line, uintptr_t reserved) { @@ -74,7 +69,7 @@ static inline void InvalidParamHandler(const wchar_t* expr, const wchar_t* func, #endif static inline bool FileIsClosed(int fd) { -#if defined(_MSC_VER) +#if defined(_WIN32) // Disables default behavior on wrong params which causes the application to crash // https://msdn.microsoft.com/en-us/library/ksazx244.aspx _set_invalid_parameter_handler(InvalidParamHandler); @@ -118,7 +113,7 @@ class MemoryMapFixture { public: void TearDown() { for (auto path : tmp_files_) { - std::remove(path.c_str()); + ARROW_UNUSED(std::remove(path.c_str())); } } diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 9c384c3e9901c..796758252979e 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -15,25 +15,34 @@ # specific language governing permissions and limitations # under the License. 
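
The `LockedInputStream` added to the readahead test above is a plain locking decorator: every override acquires a mutex and then delegates to the wrapped stream, so concurrent access from the spooler thread is serialized and race detectors stay quiet. A condensed, self-contained sketch of the same pattern — the `Source` interface here is illustrative, not Arrow's `io::InputStream`:

```cpp
#include <memory>
#include <mutex>
#include <utility>

// Illustrative stand-in for the real stream interface.
class Source {
 public:
  virtual ~Source() = default;
  virtual int ReadByte() = 0;
  virtual bool closed() const = 0;
};

// Locking decorator: serialize every call to the wrapped object.
class LockedSource : public Source {
 public:
  explicit LockedSource(std::shared_ptr<Source> wrapped)
      : wrapped_(std::move(wrapped)) {}

  int ReadByte() override {
    std::lock_guard<std::mutex> lock(mutex_);
    return wrapped_->ReadByte();
  }

  bool closed() const override {
    // mutex_ is mutable so logically-const observers can also lock it,
    // mirroring how LockedInputStream guards Tell() and closed().
    std::lock_guard<std::mutex> lock(mutex_);
    return wrapped_->closed();
  }

 private:
  std::shared_ptr<Source> wrapped_;
  mutable std::mutex mutex_;
};
```
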
+# Targets required for protocol integration testing +add_custom_target(integration) +add_dependencies(arrow-tests integration) + ####################################### # Messaging and interprocess communication ADD_ARROW_TEST(feather-test) -ADD_ARROW_TEST(ipc-read-write-test) -ADD_ARROW_TEST(ipc-json-test) +ADD_ARROW_TEST(read-write-test + PREFIX "arrow-ipc") +ADD_ARROW_TEST(json-simple-test + PREFIX "arrow-ipc") +ADD_ARROW_TEST(json-test + PREFIX "arrow-ipc") if (NOT ARROW_BOOST_HEADER_ONLY) ADD_ARROW_TEST(json-integration-test EXTRA_LINK_LIBS gflags_static) # Test is being built - if (TARGET json-integration-test) + if (TARGET arrow-json-integration-test) + add_dependencies(integration arrow-json-integration-test) if (UNIX) if (APPLE) - set_target_properties(json-integration-test + set_target_properties(arrow-json-integration-test PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() - target_link_libraries(json-integration-test PRIVATE pthread) + target_link_libraries(arrow-json-integration-test PRIVATE pthread) endif() endif() endif() @@ -44,17 +53,17 @@ set_source_files_properties(Message_generated.h PROPERTIES GENERATED TRUE) set_source_files_properties(feather_generated.h PROPERTIES GENERATED TRUE) set_source_files_properties(File_generated.h PROPERTIES GENERATED TRUE) -set(OUTPUT_DIR ${CMAKE_BINARY_DIR}/src/arrow/ipc) +set(OUTPUT_DIR ${ARROW_BINARY_DIR}/src/arrow/ipc) set(FBS_OUTPUT_FILES "${OUTPUT_DIR}/File_generated.h" "${OUTPUT_DIR}/Message_generated.h" "${OUTPUT_DIR}/feather_generated.h") set(FBS_SRC - ${CMAKE_SOURCE_DIR}/../format/Message.fbs - ${CMAKE_SOURCE_DIR}/../format/File.fbs - ${CMAKE_SOURCE_DIR}/../format/Schema.fbs - ${CMAKE_SOURCE_DIR}/../format/Tensor.fbs + ${ARROW_SOURCE_DIR}/../format/Message.fbs + ${ARROW_SOURCE_DIR}/../format/File.fbs + ${ARROW_SOURCE_DIR}/../format/Schema.fbs + ${ARROW_SOURCE_DIR}/../format/Tensor.fbs ${CMAKE_CURRENT_SOURCE_DIR}/feather.fbs) foreach(FIL ${FBS_SRC}) @@ -79,15 +88,7 @@ add_custom_command( add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) # Headers: top level -install(FILES - api.h - dictionary.h - feather.h - json.h - message.h - reader.h - writer.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/ipc") +ARROW_INSTALL_ALL_HEADERS("arrow/ipc") if (ARROW_BUILD_STATIC) set(ARROW_UTIL_LIB arrow_static) @@ -108,12 +109,15 @@ if(NOT WIN32) endif() if (ARROW_BUILD_UTILITIES) - add_executable(file-to-stream file-to-stream.cc) - target_link_libraries(file-to-stream ${UTIL_LINK_LIBS}) - add_executable(stream-to-file stream-to-file.cc) - target_link_libraries(stream-to-file ${UTIL_LINK_LIBS}) -endif() + add_executable(arrow-file-to-stream file-to-stream.cc) + target_link_libraries(arrow-file-to-stream ${UTIL_LINK_LIBS}) + add_executable(arrow-stream-to-file stream-to-file.cc) + target_link_libraries(arrow-stream-to-file ${UTIL_LINK_LIBS}) -ADD_ARROW_BENCHMARK(ipc-read-write-benchmark) + add_dependencies(integration arrow-file-to-stream) + add_dependencies(integration arrow-stream-to-file) +endif() +ADD_ARROW_BENCHMARK(read-write-benchmark + PREFIX "arrow-ipc") ADD_ARROW_FUZZING(ipc-fuzzing-test) diff --git a/cpp/src/arrow/ipc/dictionary.cc b/cpp/src/arrow/ipc/dictionary.cc index 488bb75b9d75f..aa0d9085f5a8f 100644 --- a/cpp/src/arrow/ipc/dictionary.cc +++ b/cpp/src/arrow/ipc/dictionary.cc @@ -34,9 +34,7 @@ Status DictionaryMemo::GetDictionary(int64_t id, std::shared_ptr* dictionary) const { auto it = id_to_dictionary_.find(id); if (it == id_to_dictionary_.end()) { - std::stringstream ss; - ss << "Dictionary with id " << id 
<< " not found"; - return Status::KeyError(ss.str()); + return Status::KeyError("Dictionary with id ", id, " not found"); } *dictionary = it->second; return Status::OK(); @@ -70,9 +68,7 @@ bool DictionaryMemo::HasDictionaryId(int64_t id) const { Status DictionaryMemo::AddDictionary(int64_t id, const std::shared_ptr& dictionary) { if (HasDictionaryId(id)) { - std::stringstream ss; - ss << "Dictionary with id " << id << " already exists"; - return Status::KeyError(ss.str()); + return Status::KeyError("Dictionary with id ", id, " already exists"); } intptr_t address = reinterpret_cast(dictionary.get()); id_to_dictionary_[id] = dictionary; diff --git a/cpp/src/arrow/ipc/feather-internal.h b/cpp/src/arrow/ipc/feather-internal.h index 90512dd117238..2aa04b2db72ba 100644 --- a/cpp/src/arrow/ipc/feather-internal.h +++ b/cpp/src/arrow/ipc/feather-internal.h @@ -119,7 +119,7 @@ class ARROW_EXPORT TableBuilder { class ARROW_EXPORT TableMetadata { public: - TableMetadata() {} + TableMetadata() : table_(NULLPTR) {} ~TableMetadata() = default; Status Open(const std::shared_ptr& buffer) { diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index b0be28925cf23..8139c47e09fca 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -30,6 +30,7 @@ #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index ebdb335fa57f7..d28bf7512999a 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -180,6 +180,7 @@ ColumnBuilder::ColumnBuilder(TableBuilder* parent, const std::string& name) fbb_ = &parent->fbb(); name_ = name; type_ = ColumnType::PRIMITIVE; + meta_time_.unit = TimeUnit::SECOND; } flatbuffers::Offset ColumnBuilder::CreateColumnMetadata() { @@ -642,9 +643,7 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()))) { - std::stringstream ss; - ss << "Array is not primitive type: " << values.type()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Array is not primitive type: ", values.type()->ToString()); } meta->type = ToFlatbufferType(values.type_id()); diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 3e71415c69654..fe69a53a944c7 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -170,10 +170,8 @@ static Status ValidateArrowVsJson(const std::string& arrow_path, const int arrow_nbatches = arrow_reader->num_record_batches(); if (json_nbatches != arrow_nbatches) { - std::stringstream ss; - ss << "Different number of record batches: " << json_nbatches << " (JSON) vs " - << arrow_nbatches << " (Arrow)"; - return Status::Invalid(ss.str()); + return Status::Invalid("Different number of record batches: ", json_nbatches, + " (JSON) vs ", arrow_nbatches, " (Arrow)"); } std::shared_ptr arrow_batch; @@ -231,9 +229,7 @@ Status RunCommand(const std::string& json_path, const std::string& arrow_path, return ValidateArrowVsJson(arrow_path, json_path); } else { - std::stringstream ss; - ss << "Unknown command: " << command; - return Status::Invalid(ss.str()); + return Status::Invalid("Unknown command: ", command); } } @@ -262,7 
+258,7 @@ class TestJSONIntegration : public ::testing::Test { void TearDown() { for (const std::string path : tmp_paths_) { - std::remove(path.c_str()); + ARROW_UNUSED(std::remove(path.c_str())); } } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index d5a5dd9f397db..05e547506c596 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -633,9 +633,7 @@ static Status GetInteger(const rj::Value::ConstObject& json_type, *type = is_signed ? int64() : uint64(); break; default: - std::stringstream ss; - ss << "Invalid bit width: " << bit_width; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid bit width: ", bit_width); } return Status::OK(); } @@ -654,9 +652,7 @@ static Status GetFloatingPoint(const RjObject& json_type, } else if (precision == "HALF") { *type = float16(); } else { - std::stringstream ss; - ss << "Invalid precision: " << precision; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid precision: ", precision); } return Status::OK(); } @@ -693,9 +689,7 @@ static Status GetDate(const RjObject& json_type, std::shared_ptr* type } else if (unit_str == "MILLISECOND") { *type = date64(); } else { - std::stringstream ss; - ss << "Invalid date unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid date unit: ", unit_str); } return Status::OK(); } @@ -718,9 +712,7 @@ static Status GetTime(const RjObject& json_type, std::shared_ptr* type } else if (unit_str == "NANOSECOND") { *type = time64(TimeUnit::NANO); } else { - std::stringstream ss; - ss << "Invalid time unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid time unit: ", unit_str); } const auto& fw_type = checked_cast(**type); @@ -749,9 +741,7 @@ static Status GetTimestamp(const RjObject& json_type, std::shared_ptr* } else if (unit_str == "NANOSECOND") { unit = TimeUnit::NANO; } else { - std::stringstream ss; - ss << "Invalid time unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid time unit: ", unit_str); } const auto& it_tz = json_type.FindMember("timezone"); @@ -778,9 +768,7 @@ static Status GetUnion(const RjObject& json_type, } else if (mode_str == "DENSE") { mode = UnionMode::DENSE; } else { - std::stringstream ss; - ss << "Invalid union mode: " << mode_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid union mode: ", mode_str); } const auto& it_type_codes = json_type.FindMember("typeIds"); @@ -838,9 +826,7 @@ static Status GetType(const RjObject& json_type, } else if (type_name == "union") { return GetUnion(json_type, children, type); } else { - std::stringstream ss; - ss << "Unrecognized type name: " << type_name; - return Status::Invalid(ss.str()); + return Status::Invalid("Unrecognized type name: ", type_name); } return Status::OK(); } @@ -1235,10 +1221,8 @@ class ArrayReader { const auto& json_children_arr = json_children->value.GetArray(); if (type.num_children() != static_cast(json_children_arr.Size())) { - std::stringstream ss; - ss << "Expected " << type.num_children() << " children, but got " - << json_children_arr.Size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", type.num_children(), " children, but got ", + json_children_arr.Size()); } for (int i = 0; i < static_cast(json_children_arr.Size()); ++i) { @@ -1342,9 +1326,7 @@ static Status ReadDictionary(const RjObject& obj, const DictionaryTypeMap& id_to auto it = id_to_field.find(id); if (it == 
id_to_field.end()) { - std::stringstream ss; - ss << "No dictionary with id " << id; - return Status::Invalid(ss.str()); + return Status::Invalid("No dictionary with id ", id); } std::vector> fields = {it->second}; @@ -1489,9 +1471,7 @@ Status ReadArray(MemoryPool* pool, const rj::Value& json_array, const Schema& sc } if (result == nullptr) { - std::stringstream ss; - ss << "Field named " << name << " not found in schema"; - return Status::KeyError(ss.str()); + return Status::KeyError("Field named ", name, " not found in schema"); } return ReadArray(pool, json_array, result->type(), array); diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 8807a56551789..c8c724968f67c 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -36,6 +36,7 @@ #include "rapidjson/document.h" // IWYU pragma: export #include "rapidjson/encodings.h" // IWYU pragma: export +#include "rapidjson/error/en.h" // IWYU pragma: export #include "rapidjson/stringbuffer.h" // IWYU pragma: export #include "rapidjson/writer.h" // IWYU pragma: export @@ -48,56 +49,39 @@ using RjWriter = rj::Writer; using RjArray = rj::Value::ConstArray; using RjObject = rj::Value::ConstObject; -#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ - if (NAME == (PARENT).MemberEnd()) { \ - std::stringstream ss; \ - ss << "field " << TOK << " not found"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == (PARENT).MemberEnd()) { \ + return Status::Invalid("field ", TOK, " not found"); \ } -#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + return Status::Invalid("field was not a string line ", __LINE__); \ } -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + return Status::Invalid("field was not a boolean line ", __LINE__); \ } -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + return Status::Invalid("field was not an int line ", __LINE__); \ } -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + return Status::Invalid("field was not an array line ", __LINE__); \ } -#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object" \ - << " line " << __LINE__; \ - 
return Status::Invalid(ss.str()); \ +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + return Status::Invalid("field was not an object line ", __LINE__); \ } namespace arrow { diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc new file mode 100644 index 0000000000000..2e80a0ca85822 --- /dev/null +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -0,0 +1,635 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/ipc/json-simple.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" + +#if defined(_MSC_VER) +// "warning C4307: '+': integral constant overflow" +#pragma warning(disable : 4307) +#endif + +namespace arrow { +namespace ipc { +namespace internal { +namespace json { + +using ::arrow::internal::checked_cast; + +// Avoid undefined behaviour on signed overflow +template +Signed SafeSignedAdd(Signed u, Signed v) { + using Unsigned = typename std::make_unsigned::type; + return static_cast(static_cast(u) + static_cast(v)); +} + +// Special case for 8-bit ints (must output their decimal value, not the +// corresponding ASCII character) +void JSONArrayInternal(std::ostream* ss, int8_t value) { + *ss << static_cast(value); +} + +void JSONArrayInternal(std::ostream* ss, uint8_t value) { + *ss << static_cast(value); +} + +template +void JSONArrayInternal(std::ostream* ss, const Value& value) { + *ss << value; +} + +template +void JSONArrayInternal(std::ostream* ss, const Value& value, Tail... tail) { + JSONArrayInternal(ss, value); + *ss << ", "; + JSONArrayInternal(ss, std::forward(tail)...); +} + +template +std::string JSONArray(Args... 
args) { + std::stringstream ss; + ss << "["; + JSONArrayInternal(&ss, std::forward(args)...); + ss << "]"; + return ss.str(); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& values) { + std::shared_ptr actual, expected; + + ASSERT_OK(ArrayFromJSON(type, json, &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector(type, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& is_valid, + const std::vector& values) { + std::shared_ptr actual, expected; + + ASSERT_OK(ArrayFromJSON(type, json, &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector(type, is_valid, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestHelper, JSONArray) { + // Test the JSONArray helper func + std::string s = + JSONArray(123, -4.5, static_cast(-12), static_cast(34)); + ASSERT_EQ(s, "[123, -4.5, -12, 34]"); + s = JSONArray(9223372036854775807LL, 9223372036854775808ULL, -9223372036854775807LL - 1, + 18446744073709551615ULL); + ASSERT_EQ(s, + "[9223372036854775807, 9223372036854775808, -9223372036854775808, " + "18446744073709551615]"); +} + +TEST(TestHelper, SafeSignedAdd) { + ASSERT_EQ(0, SafeSignedAdd(-128, -128)); + ASSERT_EQ(1, SafeSignedAdd(-128, -127)); + ASSERT_EQ(-128, SafeSignedAdd(1, 127)); + ASSERT_EQ(-2147483648LL, SafeSignedAdd(1, 2147483647)); +} + +template +class TestIntegers : public ::testing::Test {}; + +TYPED_TEST_CASE_P(TestIntegers); + +TYPED_TEST_P(TestIntegers, Basics) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr expected, actual; + std::shared_ptr type = TypeTraits::type_singleton(); + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[4, 0, 5]", {4, 0, 5}); + AssertJSONArray(type, "[4, null, 5]", {true, false, true}, {4, 0, 5}); + + // Test limits + const auto min_val = std::numeric_limits::min(); + const auto max_val = std::numeric_limits::max(); + std::string json_string = JSONArray(0, 1, min_val); + AssertJSONArray(type, json_string, {0, 1, min_val}); + json_string = JSONArray(0, 1, max_val); + AssertJSONArray(type, json_string, {0, 1, max_val}); +} + +TYPED_TEST_P(TestIntegers, Errors) { + using T = TypeParam; + + std::shared_ptr array; + std::shared_ptr type = TypeTraits::type_singleton(); + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "0", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "{}", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0.0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"0\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); +} + +TYPED_TEST_P(TestIntegers, OutOfBounds) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr array; + std::shared_ptr type = TypeTraits::type_singleton(); + + if (type->id() == Type::UINT64) { + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[18446744073709551616]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]", &array)); + } else if (type->id() == Type::INT64) { + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[9223372036854775808]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-9223372036854775809]", &array)); + } else if (std::is_signed::value) { + const auto lower = SafeSignedAdd(std::numeric_limits::min(), -1); + const auto upper = 
SafeSignedAdd(std::numeric_limits::max(), +1); + auto json_string = JSONArray(lower); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + } else { + const auto upper = static_cast(std::numeric_limits::max()) + 1; + auto json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]", &array)); + } +} + +REGISTER_TYPED_TEST_CASE_P(TestIntegers, Basics, Errors, OutOfBounds); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestIntegers, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestIntegers, Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestIntegers, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestIntegers, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestIntegers, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestIntegers, UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestIntegers, UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestIntegers, UInt64Type); + +TEST(TestNull, Basics) { + std::shared_ptr type = null(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[null, null]", {nullptr, nullptr}); +} + +TEST(TestNull, Errors) { + std::shared_ptr type = null(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[NaN]", &array)); +} + +TEST(TestBoolean, Basics) { + std::shared_ptr type = boolean(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[false, true, false]", {false, true, false}); + AssertJSONArray(type, "[false, true, null]", {true, true, false}, + {false, true, false}); +} + +TEST(TestBoolean, Errors) { + std::shared_ptr type = boolean(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"true\"]", &array)); +} + +TEST(TestFloat, Basics) { + std::shared_ptr type = float32(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0f, 2.5f, -3.0e4f}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0f, INFINITY, -INFINITY, 0.0f}); + + // Check NaN separately as AssertArraysEqual simply memcmp's array contents + // and NaNs can have many bit representations. 
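
An aside on the NaN handling the comment above describes: a bitwise, memcmp-style comparison is the wrong tool for NaN slots, because NaN compares unequal to everything including itself, and many distinct bit patterns are all valid NaNs. A standalone illustration, not from the patch:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

int main() {
  float a = std::nanf("1");                           // NaN carrying a payload
  float b = std::numeric_limits<float>::quiet_NaN();  // default quiet NaN

  assert(a != a);  // IEEE 754: NaN is unordered, even against itself
  assert(std::isnan(a) && std::isnan(b));

  std::uint32_t bits_a, bits_b;
  std::memcpy(&bits_a, &a, sizeof(bits_a));
  std::memcpy(&bits_b, &b, sizeof(bits_b));
  // bits_a and bits_b need not match, so comparing buffers byte-for-byte can
  // report two NaN values as "different" -- hence the explicit std::isnan()
  // check in the test below.
  return 0;
}
```
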
+  ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  float value = checked_cast<FloatArray&>(*actual).Value(0);
+  ASSERT_TRUE(std::isnan(value));
+}
+
+TEST(TestFloat, Errors) {
+  std::shared_ptr<DataType> type = float32();
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]", &array));
+}
+
+TEST(TestDouble, Basics) {
+  std::shared_ptr<DataType> type = float64();
+  std::shared_ptr<Array> expected, actual;
+
+  AssertJSONArray<DoubleType>(type, "[]", {});
+  AssertJSONArray<DoubleType>(type, "[1, 2.5, -3e4]", {1.0, 2.5, -3.0e4});
+  AssertJSONArray<DoubleType>(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false},
+                              {-0.0, INFINITY, -INFINITY, 0.0});
+
+  ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  double value = checked_cast<DoubleArray&>(*actual).Value(0);
+  ASSERT_TRUE(std::isnan(value));
+}
+
+TEST(TestDouble, Errors) {
+  std::shared_ptr<DataType> type = float64();
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]", &array));
+}
+
+TEST(TestString, Basics) {
+  // String type
+  std::shared_ptr<DataType> type = utf8();
+  std::shared_ptr<Array> expected, actual;
+
+  AssertJSONArray<StringType, std::string>(type, "[]", {});
+  AssertJSONArray<StringType, std::string>(type, "[\"\", \"foo\"]", {"", "foo"});
+  AssertJSONArray<StringType, std::string>(type, "[\"\", null]", {true, false}, {"", ""});
+  // NUL character in string
+  std::string s = "some";
+  s += '\x00';
+  s += "char";
+  AssertJSONArray<StringType, std::string>(type, "[\"\", \"some\\u0000char\"]", {"", s});
+  // UTF8 sequence in string
+  AssertJSONArray<StringType, std::string>(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"});
+
+  // Binary type
+  type = binary();
+  AssertJSONArray<BinaryType, std::string>(type, "[\"\", \"foo\", null]",
+                                           {true, true, false}, {"", "foo", ""});
+  // Arbitrary binary (non-UTF8) sequence in string
+  s = "\xff\x9f";
+  AssertJSONArray<BinaryType, std::string>(type, "[\"" + s + "\"]", {s});
+  // Bytes < 0x20 can be represented as JSON unicode escapes
+  s = '\x00';
+  s += "\x1f";
+  AssertJSONArray<BinaryType, std::string>(type, "[\"\\u0000\\u001f\"]", {s});
+}
+
+TEST(TestString, Errors) {
+  std::shared_ptr<DataType> type = utf8();
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array));
+}
+
+TEST(TestFixedSizeBinary, Basics) {
+  std::shared_ptr<DataType> type = fixed_size_binary(3);
+  std::shared_ptr<Array> expected, actual;
+
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[]", {});
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[\"foo\", \"bar\"]",
+                                                    {"foo", "bar"});
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[null, \"foo\"]",
+                                                    {false, true}, {"", "foo"});
+  // Arbitrary binary (non-UTF8) sequence in string
+  std::string s = "\xff\x9f\xcc";
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[\"" + s + "\"]", {s});
+}
+
+TEST(TestFixedSizeBinary, Errors) {
+  std::shared_ptr<DataType> type = fixed_size_binary(3);
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array));
+  // Invalid length
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"abcd\"]", &array));
+}
+
+TEST(TestDecimal, Basics) {
+  std::shared_ptr<DataType> type = decimal(10, 4);
+  std::shared_ptr<Array> expected, actual;
+
+  ASSERT_OK(ArrayFromJSON(type, "[]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  {
+    Decimal128Builder builder(type);
+    ASSERT_OK(builder.Finish(&expected));
+  }
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", \"-78.9000\"]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  {
+    Decimal128Builder builder(type);
+    ASSERT_OK(builder.Append(Decimal128(1234567)));
+    ASSERT_OK(builder.Append(Decimal128(-789000)));
+    ASSERT_OK(builder.Finish(&expected));
+  }
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", null]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  {
+    Decimal128Builder builder(type);
+    ASSERT_OK(builder.Append(Decimal128(1234567)));
+    ASSERT_OK(builder.AppendNull());
+    ASSERT_OK(builder.Finish(&expected));
+  }
+  AssertArraysEqual(*expected, *actual);
+}
+
+TEST(TestDecimal, Errors) {
+  std::shared_ptr<DataType> type = decimal(10, 4);
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[12.3456]", &array));
+  // Bad scale
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.345\"]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.34560\"]", &array));
+}
+
+TEST(TestList, IntegerList) {
+  auto pool = default_memory_pool();
+  std::shared_ptr<DataType> type = list(int64());
+  std::shared_ptr<Array> offsets, values, expected, actual;
+
+  ASSERT_OK(ArrayFromJSON(type, "[]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0}, &offsets);
+  ArrayFromVector<Int64Type, int64_t>({}, &values);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected));
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[[4, 5], [], [6]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0, 2, 2, 3}, &offsets);
+  ArrayFromVector<Int64Type, int64_t>({4, 5, 6}, &values);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected));
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0, 0, 1, 3}, &offsets);
+  auto is_valid = std::vector<bool>{false, true, false};
+  ArrayFromVector<Int64Type, int64_t>(is_valid, {0, 6, 0}, &values);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected));
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  {
+    std::unique_ptr<ArrayBuilder> builder;
+    ASSERT_OK(MakeBuilder(pool, type, &builder));
+    auto& list_builder = checked_cast<ListBuilder&>(*builder);
+    ASSERT_OK(list_builder.AppendNull());
+    ASSERT_OK(list_builder.Append());
+    ASSERT_OK(list_builder.AppendNull());
+    ASSERT_OK(list_builder.Finish(&expected));
+  }
+  AssertArraysEqual(*expected, *actual);
+}
+
+TEST(TestList, IntegerListErrors) {
+  std::shared_ptr<DataType> type = list(int64());
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]", &array));
+}
+
+TEST(TestList, NullList) {
+  auto pool = default_memory_pool();
+  std::shared_ptr<DataType> type = list(null());
+  std::shared_ptr<Array> offsets, values, expected, actual;
+
+  ASSERT_OK(ArrayFromJSON(type, "[]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0}, &offsets);
+  values = std::make_shared<NullArray>(0);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected));
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[[], [null], [null, null]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0, 0, 1, 3}, &offsets);
+  values = std::make_shared<NullArray>(3);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected));
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  {
+    std::unique_ptr<ArrayBuilder> builder;
+    ASSERT_OK(MakeBuilder(pool, type, &builder));
+    auto& list_builder = checked_cast<ListBuilder&>(*builder);
+    ASSERT_OK(list_builder.AppendNull());
+    ASSERT_OK(list_builder.Append());
+    ASSERT_OK(list_builder.AppendNull());
+    ASSERT_OK(list_builder.Finish(&expected));
+  }
+  AssertArraysEqual(*expected, *actual);
+}
+
+TEST(TestList, IntegerListList) {
+  auto pool = default_memory_pool();
+  std::shared_ptr<DataType> type = list(list(uint8()));
+  std::shared_ptr<Array> offsets, values, nested, expected, actual;
+
+  ASSERT_OK(ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0, 1, 3, 6}, &offsets);
+  ArrayFromVector<UInt8Type, uint8_t>({4, 5, 6, 7, 8, 9}, &values);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested));
+  ArrayFromVector<Int32Type, int32_t>({0, 2, 3}, &offsets);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *nested, pool, &expected));
+  ASSERT_EQ(actual->length(), 2);
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int32Type, int32_t>({0, 0, 1, 1, 3, 6}, &offsets);
+  ArrayFromVector<UInt8Type, uint8_t>({4, 5, 6, 7, 8, 9}, &values);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested));
+  ArrayFromVector<Int32Type, int32_t>({0, 0, 1, 4, 5}, &offsets);
+  ASSERT_OK(ListArray::FromArrays(*offsets, *nested, pool, &expected));
+  ASSERT_EQ(actual->length(), 4);
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[null, [null], [[null]]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  {
+    std::unique_ptr<ArrayBuilder> builder;
+    ASSERT_OK(MakeBuilder(pool, type, &builder));
+    auto& list_builder = checked_cast<ListBuilder&>(*builder);
+    auto& child_builder = checked_cast<ListBuilder&>(*list_builder.value_builder());
+    ASSERT_OK(list_builder.AppendNull());
+    ASSERT_OK(list_builder.Append());
+    ASSERT_OK(child_builder.AppendNull());
+    ASSERT_OK(list_builder.Append());
+    ASSERT_OK(child_builder.Append());
+    ASSERT_OK(list_builder.Finish(&expected));
+  }
+  AssertArraysEqual(*expected, *actual);
+}
+
+TEST(TestStruct, SimpleStruct) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  std::shared_ptr<DataType> type = struct_({field_a, field_b});
+  std::shared_ptr<Array> a, b, expected, actual;
+  std::shared_ptr<Buffer> null_bitmap;
+  std::vector<bool> is_valid;
+  std::vector<std::shared_ptr<Array>> children;
+
+  // Trivial
+  ASSERT_OK(ArrayFromJSON(type, "[]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int8Type, int8_t>({}, &a);
+  ArrayFromVector<BooleanType, bool>({}, &b);
+  children.assign({a, b});
+  expected = std::make_shared<StructArray>(type, 0, children);
+  AssertArraysEqual(*expected, *actual);
+
+  // Non-empty
+  ArrayFromVector<Int8Type, int8_t>({5, 6}, &a);
+  ArrayFromVector<BooleanType, bool>({true, false}, &b);
+  children.assign({a, b});
+  expected = std::make_shared<StructArray>(type, 2, children);
+
+  ASSERT_OK(ArrayFromJSON(type, "[[5, true], [6, false]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  AssertArraysEqual(*expected, *actual);
+  ASSERT_OK(ArrayFromJSON(type, "[{\"a\": 5, \"b\": true}, {\"b\": false, \"a\": 6}]",
+                          &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  AssertArraysEqual(*expected, *actual);
+
+  // With nulls
+  is_valid = {false, true, false, false};
+  ArrayFromVector<Int8Type, int8_t>(is_valid, {0, 5, 6, 0}, &a);
+  is_valid = {false, false, true, false};
+  ArrayFromVector<BooleanType, bool>(is_valid, {false, true, false, false}, &b);
+  children.assign({a, b});
+  BitmapFromVector({false, true, true, true}, &null_bitmap);
+  expected = std::make_shared<StructArray>(type, 4, children, null_bitmap, 1);
+
+  ASSERT_OK(
+      ArrayFromJSON(type, "[null, [5, null], [null, false], [null, null]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  AssertArraysEqual(*expected, *actual);
+  // When using object notation, null members can be omitted
+  ASSERT_OK(ArrayFromJSON(type, "[null, {\"a\": 5, \"b\": null}, {\"b\": false}, {}]",
+                          &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  AssertArraysEqual(*expected, *actual);
+}
+
+TEST(TestStruct, NestedStruct) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  auto field_c = field("c", float64());
+  std::shared_ptr<DataType> nested_type = struct_({field_a, field_b});
+  auto field_nested = field("nested", nested_type);
+  std::shared_ptr<DataType> type = struct_({field_nested, field_c});
+  std::shared_ptr<Array> expected, actual;
+  std::shared_ptr<Buffer> null_bitmap;
+  std::vector<bool> is_valid;
+  std::vector<std::shared_ptr<Array>> children(2);
+
+  ASSERT_OK(ArrayFromJSON(type, "[]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int8Type, int8_t>({}, &children[0]);
+  ArrayFromVector<BooleanType, bool>({}, &children[1]);
+  children[0] = std::make_shared<StructArray>(nested_type, 0, children);
+  ArrayFromVector<DoubleType, double>({}, &children[1]);
+  expected = std::make_shared<StructArray>(type, 0, children);
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[[[5, true], 1.5], [[6, false], -3e2]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  ArrayFromVector<Int8Type, int8_t>({5, 6}, &children[0]);
+  ArrayFromVector<BooleanType, bool>({true, false}, &children[1]);
+  children[0] = std::make_shared<StructArray>(nested_type, 2, children);
+  ArrayFromVector<DoubleType, double>({1.5, -300.0}, &children[1]);
+  expected = std::make_shared<StructArray>(type, 2, children);
+  AssertArraysEqual(*expected, *actual);
+
+  ASSERT_OK(ArrayFromJSON(type, "[null, [[5, null], null], [null, -3e2]]", &actual));
+  ASSERT_OK(ValidateArray(*actual));
+  is_valid = {false, true, false};
+  ArrayFromVector<Int8Type, int8_t>(is_valid, {0, 5, 0}, &children[0]);
+  is_valid = {false, false, false};
+  ArrayFromVector<BooleanType, bool>(is_valid, {false, false, false}, &children[1]);
+  BitmapFromVector({false, true, false}, &null_bitmap);
+  children[0] = std::make_shared<StructArray>(nested_type, 3, children, null_bitmap, 2);
+  is_valid = {false, false, true};
+  ArrayFromVector<DoubleType, double>(is_valid, {0.0, 0.0, -300.0}, &children[1]);
+  BitmapFromVector({false, true, true}, &null_bitmap);
+  expected = std::make_shared<StructArray>(type, 3, children, null_bitmap, 1);
+  AssertArraysEqual(*expected, *actual);
+}
+
+TEST(TestStruct, Errors) {
+  auto field_a = field("a", int8());
+  auto field_b = field("b", boolean());
+  std::shared_ptr<DataType> type = struct_({field_a, field_b});
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0, true]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, true, 1]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[true, 0]]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"b\": 0, \"a\": true}]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array));
+}
+
+}  // namespace json
+}  // namespace internal
+}  // namespace ipc
+}  // namespace arrow
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
new file mode 100644
index 0000000000000..047788ce0f5de
--- /dev/null
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -0,0 +1,540 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/ipc/json-internal.h"
+#include "arrow/ipc/json-simple.h"
+#include "arrow/memory_pool.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace ipc {
+namespace internal {
+namespace json {
+
+using ::arrow::internal::checked_cast;
+
+static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
+
+static Status JSONTypeError(const char* expected_type, rj::Type json_type) {
+  return Status::Invalid("Expected ", expected_type, " or null, got JSON type ",
+                         json_type);
+}
+
+class Converter {
+ public:
+  virtual ~Converter() = default;
+
+  virtual Status Init() { return Status::OK(); }
+
+  virtual Status AppendValue(const rj::Value& json_obj) = 0;
+
+  virtual Status AppendNull() = 0;
+
+  virtual Status AppendValues(const rj::Value& json_array) = 0;
+
+  virtual std::shared_ptr<ArrayBuilder> builder() = 0;
+
+  virtual Status Finish(std::shared_ptr<Array>* out) {
+    auto builder = this->builder();
+    if (builder->length() == 0) {
+      // Make sure the builder was initialized
+      RETURN_NOT_OK(builder->Resize(1));
+    }
+    return builder->Finish(out);
+  }
+
+ protected:
+  std::shared_ptr<DataType> type_;
+};
+
+Status GetConverter(const std::shared_ptr<DataType>&, std::shared_ptr<Converter>* out);
+
+// CRTP
+template <class Derived>
+class ConcreteConverter : public Converter {
+ public:
+  Status AppendValues(const rj::Value& json_array) override {
+    auto self = static_cast<Derived*>(this);
+    if (!json_array.IsArray()) {
+      return JSONTypeError("array", json_array.GetType());
+    }
+    auto size = json_array.Size();
+    for (uint32_t i = 0; i < size; ++i) {
+      RETURN_NOT_OK(self->AppendValue(json_array[i]));
+    }
+    return Status::OK();
+  }
+};
+
+// TODO: dates and times?
+
+// ------------------------------------------------------------------------
+// Converter for null arrays
+
+class NullConverter final : public ConcreteConverter<NullConverter> {
+ public:
+  explicit NullConverter(const std::shared_ptr<DataType>& type) {
+    type_ = type;
+    builder_ = std::make_shared<NullBuilder>();
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    return JSONTypeError("null", json_obj.GetType());
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<NullBuilder> builder_;
+};
+
+// ------------------------------------------------------------------------
+// Converter for boolean arrays
+
+class BooleanConverter final : public ConcreteConverter<BooleanConverter> {
+ public:
+  explicit BooleanConverter(const std::shared_ptr<DataType>& type) {
+    type_ = type;
+    builder_ = std::make_shared<BooleanBuilder>();
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsBool()) {
+      return builder_->Append(json_obj.GetBool());
+    }
+    return JSONTypeError("boolean", json_obj.GetType());
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<BooleanBuilder> builder_;
+};
+
+// ------------------------------------------------------------------------
+// Converter for int arrays
+
+template <typename Type>
+class IntegerConverter final : public ConcreteConverter<IntegerConverter<Type>> {
+  using c_type = typename Type::c_type;
+  static constexpr auto is_signed = std::is_signed<c_type>::value;
+
+ public:
+  explicit IntegerConverter(const std::shared_ptr<DataType>& type) {
+    this->type_ = type;
+    builder_ = std::make_shared<NumericBuilder<Type>>();
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    return AppendNumber(json_obj);
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  // Append signed integer value
+  template <typename T = c_type>
+  typename std::enable_if<std::is_signed<T>::value, Status>::type AppendNumber(
+      const rj::Value& json_obj) {
+    if (json_obj.IsInt64()) {
+      int64_t v64 = json_obj.GetInt64();
+      c_type v = static_cast<c_type>(v64);
+      if (v == v64) {
+        return builder_->Append(v);
+      } else {
+        return Status::Invalid("Value ", v64, " out of bounds for ",
+                               this->type_->ToString());
+      }
+    } else {
+      return JSONTypeError("signed int", json_obj.GetType());
+    }
+  }
+
+  // Append unsigned integer value
+  template <typename T = c_type>
+  typename std::enable_if<std::is_unsigned<T>::value, Status>::type AppendNumber(
+      const rj::Value& json_obj) {
+    if (json_obj.IsUint64()) {
+      uint64_t v64 = json_obj.GetUint64();
+      c_type v = static_cast<c_type>(v64);
+      if (v == v64) {
+        return builder_->Append(v);
+      } else {
+        return Status::Invalid("Value ", v64, " out of bounds for ",
+                               this->type_->ToString());
+      }
+    } else {
+      return JSONTypeError("unsigned int", json_obj.GetType());
+    }
+  }
+
+  std::shared_ptr<NumericBuilder<Type>> builder_;
+};
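Editor's note: the cast-and-compare in `IntegerConverter::AppendNumber` is the entire range check — narrow the 64-bit value, widen it back, and reject on mismatch — so no per-type bounds table is needed. A minimal standalone sketch of the same idea (plain C++, no Arrow types; `fits_in` is a name invented here):

```cpp
#include <cstdint>
#include <iostream>

// True iff v64 survives a round trip through the narrower signed type T,
// i.e. it is exactly representable in T (the check IntegerConverter relies on).
template <typename T>
bool fits_in(int64_t v64) {
  T v = static_cast<T>(v64);
  return static_cast<int64_t>(v) == v64;
}

int main() {
  std::cout << fits_in<int8_t>(127) << "\n";      // 1: representable
  std::cout << fits_in<int8_t>(128) << "\n";      // 0: narrows inexactly, rejected
  std::cout << fits_in<int16_t>(-40000) << "\n";  // 0: out of range
  return 0;
}
```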
+
+// ------------------------------------------------------------------------
+// Converter for float arrays
+
+template <typename Type>
+class FloatConverter final : public ConcreteConverter<FloatConverter<Type>> {
+  using c_type = typename Type::c_type;
+
+ public:
+  explicit FloatConverter(const std::shared_ptr<DataType>& type) {
+    this->type_ = type;
+    builder_ = std::make_shared<NumericBuilder<Type>>();
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsNumber()) {
+      c_type v = static_cast<c_type>(json_obj.GetDouble());
+      return builder_->Append(v);
+    } else {
+      return JSONTypeError("number", json_obj.GetType());
+    }
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<NumericBuilder<Type>> builder_;
+};
+
+// ------------------------------------------------------------------------
+// Converter for decimal arrays
+
+class DecimalConverter final : public ConcreteConverter<DecimalConverter> {
+ public:
+  explicit DecimalConverter(const std::shared_ptr<DataType>& type) {
+    this->type_ = type;
+    decimal_type_ = checked_cast<Decimal128Type*>(type.get());
+    builder_ = std::make_shared<Decimal128Builder>(type);
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsString()) {
+      int32_t precision, scale;
+      Decimal128 d;
+      auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength());
+      RETURN_NOT_OK(Decimal128::FromString(view, &d, &precision, &scale));
+      if (scale != decimal_type_->scale()) {
+        return Status::Invalid("Invalid scale for decimal: expected ",
+                               decimal_type_->scale(), ", got ", scale);
+      }
+      return builder_->Append(d);
+    }
+    return JSONTypeError("decimal string", json_obj.GetType());
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<Decimal128Builder> builder_;
+  Decimal128Type* decimal_type_;
+};
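Editor's note: `DecimalConverter` rejects any input whose inferred scale differs from the type's scale rather than rescaling it, which is what makes `"12.345"` invalid for `decimal(10, 4)` in the tests above. A small arithmetic sketch of the scaled-integer representation (plain C++, not the Arrow `Decimal128` API):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // decimal(10, 4) stores values as integers scaled by 10^4:
  // "123.4567" -> 1234567 and "-78.9000" -> -789000, as in TestDecimal.
  const int64_t kScaleFactor = 10000;  // 10^4
  int64_t unscaled = 1234567;
  std::cout << unscaled / kScaleFactor << "." << unscaled % kScaleFactor << "\n";
  // "12.345" carries 3 fractional digits, so its inferred scale is 3, not 4;
  // the converter surfaces that mismatch as Status::Invalid instead of
  // silently padding a trailing zero.
  return 0;
}
```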
+
+// ------------------------------------------------------------------------
+// Converter for binary and string arrays
+
+class StringConverter final : public ConcreteConverter<StringConverter> {
+ public:
+  explicit StringConverter(const std::shared_ptr<DataType>& type) {
+    this->type_ = type;
+    builder_ = std::make_shared<BinaryBuilder>(type, default_memory_pool());
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsString()) {
+      auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength());
+      return builder_->Append(view);
+    } else {
+      return JSONTypeError("string", json_obj.GetType());
+    }
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<BinaryBuilder> builder_;
+};
+
+// ------------------------------------------------------------------------
+// Converter for fixed-size binary arrays
+
+class FixedSizeBinaryConverter final
+    : public ConcreteConverter<FixedSizeBinaryConverter> {
+ public:
+  explicit FixedSizeBinaryConverter(const std::shared_ptr<DataType>& type) {
+    this->type_ = type;
+    builder_ = std::make_shared<FixedSizeBinaryBuilder>(type, default_memory_pool());
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsString()) {
+      auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength());
+      if (view.length() != static_cast<size_t>(builder_->byte_width())) {
+        std::stringstream ss;
+        ss << "Invalid string length " << view.length() << " in JSON input for "
+           << this->type_->ToString();
+        return Status::Invalid(ss.str());
+      }
+      return builder_->Append(view);
+    } else {
+      return JSONTypeError("string", json_obj.GetType());
+    }
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<FixedSizeBinaryBuilder> builder_;
+};
+
+// ------------------------------------------------------------------------
+// Converter for list arrays
+
+class ListConverter final : public ConcreteConverter<ListConverter> {
+ public:
+  explicit ListConverter(const std::shared_ptr<DataType>& type) { type_ = type; }
+
+  Status Init() override {
+    const auto& list_type = checked_cast<const ListType&>(*type_);
+    RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_));
+    auto child_builder = child_converter_->builder();
+    builder_ = std::make_shared<ListBuilder>(default_memory_pool(), child_builder, type_);
+    return Status::OK();
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    RETURN_NOT_OK(builder_->Append());
+    // Extend the child converter with this JSON array
+    return child_converter_->AppendValues(json_obj);
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<ListBuilder> builder_;
+  std::shared_ptr<Converter> child_converter_;
+};
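Editor's note: the struct converter that follows accepts each row in two spellings, an ordered JSON array or a JSON object keyed by field name. A hypothetical end-to-end usage sketch, assuming `arrow/api.h` and the `ARROW_CHECK_OK` macro from `arrow/util/logging.h` are usable as elsewhere in the tree:

```cpp
#include <memory>
#include "arrow/api.h"
#include "arrow/ipc/json-simple.h"
#include "arrow/util/logging.h"

using arrow::ipc::internal::json::ArrayFromJSON;

int main() {
  auto type = arrow::struct_({arrow::field("a", arrow::int8()),
                              arrow::field("b", arrow::boolean())});
  std::shared_ptr<arrow::Array> from_arrays, from_objects;
  // Array notation: one JSON array per struct row, members in field order.
  ARROW_CHECK_OK(ArrayFromJSON(type, "[[5, true], [6, null]]", &from_arrays));
  // Object notation: omitted members ("b" in the second row) become null.
  ARROW_CHECK_OK(ArrayFromJSON(type, "[{\"a\": 5, \"b\": true}, {\"a\": 6}]",
                               &from_objects));
  return from_arrays->Equals(*from_objects) ? 0 : 1;  // expected to be equal
}
```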
+
+// ------------------------------------------------------------------------
+// Converter for struct arrays
+
+class StructConverter final : public ConcreteConverter<StructConverter> {
+ public:
+  explicit StructConverter(const std::shared_ptr<DataType>& type) { type_ = type; }
+
+  Status Init() override {
+    std::vector<std::shared_ptr<ArrayBuilder>> child_builders;
+    for (const auto& field : type_->children()) {
+      std::shared_ptr<Converter> child_converter;
+      RETURN_NOT_OK(GetConverter(field->type(), &child_converter));
+      child_converters_.push_back(child_converter);
+      child_builders.push_back(child_converter->builder());
+    }
+    builder_ = std::make_shared<StructBuilder>(type_, default_memory_pool(),
+                                               std::move(child_builders));
+    return Status::OK();
+  }
+
+  Status AppendNull() override {
+    for (auto& converter : child_converters_) {
+      RETURN_NOT_OK(converter->AppendNull());
+    }
+    return builder_->AppendNull();
+  }
+
+  // Append a JSON value that is either an array of N elements in order
+  // or an object mapping struct names to values (omitted struct members
+  // are mapped to null).
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsArray()) {
+      auto size = json_obj.Size();
+      auto expected_size = static_cast<uint32_t>(type_->num_children());
+      if (size != expected_size) {
+        return Status::Invalid("Expected array of size ", expected_size,
+                               ", got array of size ", size);
+      }
+      for (uint32_t i = 0; i < size; ++i) {
+        RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i]));
+      }
+      return builder_->Append();
+    }
+    if (json_obj.IsObject()) {
+      auto remaining = json_obj.MemberCount();
+      auto num_children = type_->num_children();
+      for (int32_t i = 0; i < num_children; ++i) {
+        const auto& field = type_->child(i);
+        auto it = json_obj.FindMember(field->name());
+        if (it != json_obj.MemberEnd()) {
+          --remaining;
+          RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value));
+        } else {
+          RETURN_NOT_OK(child_converters_[i]->AppendNull());
+        }
+      }
+      if (remaining > 0) {
+        return Status::Invalid("Unexpected members in JSON object for type ",
+                               type_->ToString());
+      }
+      return builder_->Append();
+    }
+    return JSONTypeError("array or object", json_obj.GetType());
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<StructBuilder> builder_;
+  std::vector<std::shared_ptr<Converter>> child_converters_;
+};
+
+// ------------------------------------------------------------------------
+// General conversion functions
+
+Status GetConverter(const std::shared_ptr<DataType>& type,
+                    std::shared_ptr<Converter>* out) {
+  std::shared_ptr<Converter> res;
+
+#define SIMPLE_CONVERTER_CASE(ID, CLASS)  \
+  case ID:                                \
+    res = std::make_shared<CLASS>(type);  \
+    break;
+
+  switch (type->id()) {
+    SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter<Int8Type>)
+    SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter<Int16Type>)
+    SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter<Int32Type>)
+    SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter<Int32Type>)
+    SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter<Int32Type>)
+    SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter<Int64Type>)
+    SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter<UInt8Type>)
+    SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter<UInt16Type>)
+    SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter<UInt32Type>)
+    SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter<UInt64Type>)
+    SIMPLE_CONVERTER_CASE(Type::NA, NullConverter)
+    SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter)
+    SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter<FloatType>)
+    SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter<DoubleType>)
+    SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter)
+    SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter)
+    SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter)
+    SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter)
+    SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
+    SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
+    default: {
+      return Status::NotImplemented("JSON conversion to ", type->ToString(),
+                                    " not implemented");
+    }
+  }
+
+#undef SIMPLE_CONVERTER_CASE
+
+  RETURN_NOT_OK(res->Init());
+  *out = res;
+  return Status::OK();
+}
+
+Status ArrayFromJSON(const std::shared_ptr<DataType>& type,
+                     const util::string_view& json_string, std::shared_ptr<Array>* out) {
+  std::shared_ptr<Converter> converter;
+  RETURN_NOT_OK(GetConverter(type, &converter));
+
+  rj::Document json_doc;
+  json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
+  if (json_doc.HasParseError()) {
+    return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": 
", + GetParseError_En(json_doc.GetParseError())); + } + + // The JSON document should be an array, append it + RETURN_NOT_OK(converter->AppendValues(json_doc)); + return converter->Finish(out); +} + +Status ArrayFromJSON(const std::shared_ptr& type, + const std::string& json_string, std::shared_ptr* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +Status ArrayFromJSON(const std::shared_ptr& type, const char* json_string, + std::shared_ptr* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-simple.h b/cpp/src/arrow/ipc/json-simple.h new file mode 100644 index 0000000000000..da6483ff1556f --- /dev/null +++ b/cpp/src/arrow/ipc/json-simple.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Implement a simple JSON representation format for arrays + +#ifndef ARROW_IPC_JSON_SIMPLE_H +#define ARROW_IPC_JSON_SIMPLE_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace ipc { +namespace internal { +namespace json { + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const std::string& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const util::string_view& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const char* json, + std::shared_ptr* out); + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_SIMPLE_H diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/json-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-json-test.cc rename to cpp/src/arrow/ipc/json-test.cc diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 394563c53c09d..61c242ca2dbbb 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -99,7 +99,7 @@ Status JsonWriter::WriteRecordBatch(const RecordBatch& batch) { class JsonReader::JsonReaderImpl { public: JsonReaderImpl(MemoryPool* pool, const std::shared_ptr& data) - : pool_(pool), data_(data) {} + : pool_(pool), data_(data), record_batches_(nullptr) {} Status ParseAndReadSchema() { doc_.Parse(reinterpret_cast(data_->data()), diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 724e6255cbddb..23709a4619207 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -63,6 +63,8 @@ class Message::MessageImpl { return Message::RECORD_BATCH; case flatbuf::MessageHeader_Tensor: return Message::TENSOR; + case 
flatbuf::MessageHeader_SparseTensor: + return Message::SPARSE_TENSOR; default: return Message::NONE; } @@ -153,10 +155,8 @@ Status Message::ReadFrom(const std::shared_ptr& metadata, io::InputStrea std::shared_ptr body; RETURN_NOT_OK(stream->Read(body_length, &body)); if (body->size() < body_length) { - std::stringstream ss; - ss << "Expected to be able to read " << body_length << " bytes for message body, got " - << body->size(); - return Status::IOError(ss.str()); + return Status::IOError("Expected to be able to read ", body_length, + " bytes for message body, got ", body->size()); } return Message::Open(metadata, body, out); @@ -171,10 +171,8 @@ Status Message::ReadFrom(const int64_t offset, const std::shared_ptr& me std::shared_ptr body; RETURN_NOT_OK(file->ReadAt(offset, body_length, &body)); if (body->size() < body_length) { - std::stringstream ss; - ss << "Expected to be able to read " << body_length << " bytes for message body, got " - << body->size(); - return Status::IOError(ss.str()); + return Status::IOError("Expected to be able to read ", body_length, + " bytes for message body, got ", body->size()); } return Message::Open(metadata, body, out); @@ -238,19 +236,16 @@ Status ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile RETURN_NOT_OK(file->ReadAt(offset, metadata_length, &buffer)); if (buffer->size() < metadata_length) { - std::stringstream ss; - ss << "Expected to read " << metadata_length << " metadata bytes but got " - << buffer->size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected to read ", metadata_length, + " metadata bytes but got ", buffer->size()); } int32_t flatbuffer_size = *reinterpret_cast(buffer->data()); if (flatbuffer_size + static_cast(sizeof(int32_t)) > metadata_length) { - std::stringstream ss; - ss << "flatbuffer size " << metadata_length << " invalid. File offset: " << offset - << ", metadata length: " << metadata_length; - return Status::Invalid(ss.str()); + return Status::Invalid("flatbuffer size ", metadata_length, + " invalid. 
File offset: ", offset, + ", metadata length: ", metadata_length); } auto metadata = SliceBuffer(buffer, 4, buffer->size() - 4); @@ -303,10 +298,8 @@ Status ReadMessage(io::InputStream* file, std::unique_ptr* message) { std::shared_ptr metadata; RETURN_NOT_OK(file->Read(message_length, &metadata)); if (metadata->size() != message_length) { - std::stringstream ss; - ss << "Expected to read " << message_length << " metadata bytes, but " - << "only read " << metadata->size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected to read ", message_length, " metadata bytes, but ", + "only read ", metadata->size()); } return Message::ReadFrom(metadata, file, message); diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 092a19ff9a0cf..760012d1a6878 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -70,7 +70,7 @@ constexpr int kMaxNestingDepth = 64; /// \brief An IPC message including metadata and body class ARROW_EXPORT Message { public: - enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR }; + enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR, SPARSE_TENSOR }; /// \brief Construct message, but do not validate /// diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index ef189c8ae617a..da6711395f8ea 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -31,6 +31,7 @@ #include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" @@ -50,6 +51,7 @@ using DictionaryOffset = flatbuffers::Offset; using FieldOffset = flatbuffers::Offset; using KeyValueOffset = flatbuffers::Offset; using RecordBatchOffset = flatbuffers::Offset; +using SparseTensorOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; @@ -443,9 +445,7 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type, return UnionToFlatBuffer(fbb, *value_type, children, dictionary_memo, offset); default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy - std::stringstream ss; - ss << "Unable to convert type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unable to convert type: ", type.ToString()); } return Status::OK(); } @@ -483,9 +483,7 @@ static Status TensorTypeToFlatbuffer(FBB& fbb, const DataType& type, break; default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy - std::stringstream ss; - ss << "Unable to convert type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unable to convert type: ", type.ToString()); } return Status::OK(); } @@ -785,6 +783,106 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, body_length, out); } +Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { + *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO; + const BufferMetadata& indices_metadata = buffers[0]; + flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); + *fb_sparse_index = flatbuf::CreateSparseTensorIndexCOO(fbb, &indices).Union(); + *num_buffers = 1; + return Status::OK(); +} + 
+Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index,
+                                const std::vector<BufferMetadata>& buffers,
+                                flatbuf::SparseTensorIndex* fb_sparse_index_type,
+                                Offset* fb_sparse_index, size_t* num_buffers) {
+  *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR;
+  const BufferMetadata& indptr_metadata = buffers[0];
+  const BufferMetadata& indices_metadata = buffers[1];
+  flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length);
+  flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length);
+  *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, &indptr, &indices).Union();
+  *num_buffers = 2;
+  return Status::OK();
+}
+
+Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index,
+                             const std::vector<BufferMetadata>& buffers,
+                             flatbuf::SparseTensorIndex* fb_sparse_index_type,
+                             Offset* fb_sparse_index, size_t* num_buffers) {
+  switch (sparse_index.format_id()) {
+    case SparseTensorFormat::COO:
+      RETURN_NOT_OK(MakeSparseTensorIndexCOO(
+          fbb, checked_cast<const SparseCOOIndex&>(sparse_index), buffers,
+          fb_sparse_index_type, fb_sparse_index, num_buffers));
+      break;
+
+    case SparseTensorFormat::CSR:
+      RETURN_NOT_OK(MakeSparseMatrixIndexCSR(
+          fbb, checked_cast<const SparseCSRIndex&>(sparse_index), buffers,
+          fb_sparse_index_type, fb_sparse_index, num_buffers));
+      break;
+
+    default:
+      return Status::NotImplemented("Unsupported sparse tensor format: ",
+                                    sparse_index.ToString());
+  }
+
+  return Status::OK();
+}
+
+Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, int64_t body_length,
+                        const std::vector<BufferMetadata>& buffers,
+                        SparseTensorOffset* offset) {
+  flatbuf::Type fb_type_type;
+  Offset fb_type;
+  RETURN_NOT_OK(
+      TensorTypeToFlatbuffer(fbb, *sparse_tensor.type(), &fb_type_type, &fb_type));
+
+  using TensorDimOffset = flatbuffers::Offset<flatbuf::TensorDim>;
+  std::vector<TensorDimOffset> dims;
+  for (int i = 0; i < sparse_tensor.ndim(); ++i) {
+    FBString name = fbb.CreateString(sparse_tensor.dim_name(i));
+    dims.push_back(flatbuf::CreateTensorDim(fbb, sparse_tensor.shape()[i], name));
+  }
+
+  auto fb_shape = fbb.CreateVector(dims);
+
+  flatbuf::SparseTensorIndex fb_sparse_index_type;
+  Offset fb_sparse_index;
+  size_t num_index_buffers = 0;
+  RETURN_NOT_OK(MakeSparseTensorIndex(fbb, *sparse_tensor.sparse_index(), buffers,
+                                      &fb_sparse_index_type, &fb_sparse_index,
+                                      &num_index_buffers));
+
+  const BufferMetadata& data_metadata = buffers[num_index_buffers];
+  flatbuf::Buffer data(data_metadata.offset, data_metadata.length);
+
+  const int64_t non_zero_length = sparse_tensor.non_zero_length();
+
+  *offset =
+      flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, non_zero_length,
+                                  fb_sparse_index_type, fb_sparse_index, &data);
+
+  return Status::OK();
+}
+
+Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length,
+                                const std::vector<BufferMetadata>& buffers,
+                                std::shared_ptr<Buffer>* out) {
+  FBB fbb;
+  SparseTensorOffset fb_sparse_tensor;
+  RETURN_NOT_OK(
+      MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor));
+  return WriteFBMessage(fbb, flatbuf::MessageHeader_SparseTensor,
+                        fb_sparse_tensor.Union(), body_length, out);
+}
+
 Status WriteDictionaryMessage(int64_t id, int64_t length, int64_t body_length,
                               const std::vector<FieldMetadata>& nodes,
                               const std::vector<BufferMetadata>& buffers,
@@ -937,6 +1035,52 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
   return TypeFromFlatbuffer(tensor->type_type(), tensor->type(), {}, type);
 }
 
+Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
std::vector* shape, + std::vector* dim_names, + int64_t* non_zero_length, + SparseTensorFormat::type* sparse_tensor_format_id) { + auto message = flatbuf::GetMessage(metadata.data()); + if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { + return Status::IOError("Header of flatbuffer-encoded Message is not SparseTensor."); + } + if (message->header() == nullptr) { + return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); + } + + auto sparse_tensor = reinterpret_cast(message->header()); + int ndim = static_cast(sparse_tensor->shape()->size()); + + for (int i = 0; i < ndim; ++i) { + auto dim = sparse_tensor->shape()->Get(i); + + shape->push_back(dim->size()); + auto fb_name = dim->name(); + if (fb_name == 0) { + dim_names->push_back(""); + } else { + dim_names->push_back(fb_name->str()); + } + } + + *non_zero_length = sparse_tensor->non_zero_length(); + + switch (sparse_tensor->sparseIndex_type()) { + case flatbuf::SparseTensorIndex_SparseTensorIndexCOO: + *sparse_tensor_format_id = SparseTensorFormat::COO; + break; + + case flatbuf::SparseTensorIndex_SparseMatrixIndexCSR: + *sparse_tensor_format_id = SparseTensorFormat::CSR; + break; + + default: + return Status::Invalid("Unrecognized sparse index type"); + } + + return TypeFromFlatbuffer(sparse_tensor->type_type(), sparse_tensor->type(), {}, type); +} + // ---------------------------------------------------------------------- // Implement message writing diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 152ca1367ec0e..6562382b878e6 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -33,6 +33,7 @@ #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" #include "arrow/memory_pool.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" namespace arrow { @@ -40,6 +41,7 @@ namespace arrow { class DataType; class Schema; class Tensor; +class SparseTensor; namespace flatbuf = org::apache::arrow::flatbuf; @@ -103,6 +105,12 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type std::vector* shape, std::vector* strides, std::vector* dim_names); +// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message +Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, + std::vector* dim_names, int64_t* length, + SparseTensorFormat::type* sparse_tensor_format_id); + /// Write a serialized message metadata with a length-prefix and padding to an /// 8-byte offset. 
Does not make assumptions about whether the stream is
 /// aligned already
@@ -137,6 +145,10 @@ Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length,
 Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset,
                           std::shared_ptr<Buffer>* out);
 
+Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length,
+                                const std::vector<BufferMetadata>& buffers,
+                                std::shared_ptr<Buffer>* out);
+
 Status WriteFileFooter(const Schema& schema, const std::vector<FileBlock>& dictionaries,
                        const std::vector<FileBlock>& record_batches,
                        DictionaryMemo* dictionary_memo, io::OutputStream* out);
diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/read-write-benchmark.cc
similarity index 100%
rename from cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
rename to cpp/src/arrow/ipc/read-write-benchmark.cc
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc
similarity index 86%
rename from cpp/src/arrow/ipc/ipc-read-write-test.cc
rename to cpp/src/arrow/ipc/read-write-test.cc
index 3a723badf37d7..bc27386f34f30 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/read-write-test.cc
@@ -38,6 +38,7 @@
 #include "arrow/ipc/writer.h"
 #include "arrow/memory_pool.h"
 #include "arrow/record_batch.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/status.h"
 #include "arrow/tensor.h"
 #include "arrow/test-util.h"
@@ -844,6 +845,117 @@ TEST_F(TestTensorRoundTrip, NonContiguous) {
   CheckTensorRoundTrip(tensor);
 }
 
+class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture {
+ public:
+  void SetUp() { pool_ = default_memory_pool(); }
+  void TearDown() { io::MemoryMapFixture::TearDown(); }
+
+  template <typename SparseIndexType>
+  void CheckSparseTensorRoundTrip(const SparseTensorImpl<SparseIndexType>& tensor) {
+    GTEST_FAIL();
+  }
+};
+
+template <>
+void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip(
+    const SparseTensorImpl<SparseCOOIndex>& tensor) {
+  const auto& type = checked_cast<const FixedWidthType&>(*tensor.type());
+  const int elem_size = type.bit_width() / 8;
+
+  int32_t metadata_length;
+  int64_t body_length;
+
+  ASSERT_OK(mmap_->Seek(0));
+
+  ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length,
+                              default_memory_pool()));
+
+  const auto& sparse_index = checked_cast<const SparseCOOIndex&>(*tensor.sparse_index());
+  const int64_t indices_length = elem_size * sparse_index.indices()->size();
+  const int64_t data_length = elem_size * tensor.non_zero_length();
+  const int64_t expected_body_length = indices_length + data_length;
+  ASSERT_EQ(expected_body_length, body_length);
+
+  ASSERT_OK(mmap_->Seek(0));
+
+  std::shared_ptr<SparseTensor> result;
+  ASSERT_OK(ReadSparseTensor(mmap_.get(), &result));
+
+  const auto& resulted_sparse_index =
+      checked_cast<const SparseCOOIndex&>(*result->sparse_index());
+  ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length);
+  ASSERT_EQ(result->data()->size(), data_length);
+  ASSERT_TRUE(result->Equals(tensor));
+}
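Editor's note: the `expected_body_length` check above works because the writer lays index buffers and value data end to end, each padded to an 8-byte boundary; with int64 elements every buffer length is already a multiple of 8, so the sum is exact. A sketch of that rule (`RoundUp8` reimplements what `BitUtil::RoundUpToMultipleOf8` does in the Arrow tree, to stay standalone):

```cpp
#include <cstdint>
#include <iostream>

// Round up to the next multiple of 8, as the IPC writer pads each buffer.
int64_t RoundUp8(int64_t size) { return (size + 7) & ~int64_t(7); }

int main() {
  // COO example: 12 non-zero int64 values in a 3-D tensor.
  int64_t indices = RoundUp8(12 * 3 * 8);  // 12 coordinate triples
  int64_t data = RoundUp8(12 * 8);         // 12 values
  std::cout << "body length = " << indices + data << "\n";  // 384
  return 0;
}
```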
+
+template <>
+void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip(
+    const SparseTensorImpl<SparseCSRIndex>& tensor) {
+  const auto& type = checked_cast<const FixedWidthType&>(*tensor.type());
+  const int elem_size = type.bit_width() / 8;
+
+  int32_t metadata_length;
+  int64_t body_length;
+
+  ASSERT_OK(mmap_->Seek(0));
+
+  ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length,
+                              default_memory_pool()));
+
+  const auto& sparse_index = checked_cast<const SparseCSRIndex&>(*tensor.sparse_index());
+  const int64_t indptr_length = elem_size * sparse_index.indptr()->size();
+  const int64_t indices_length = elem_size * sparse_index.indices()->size();
+  const int64_t data_length = elem_size * tensor.non_zero_length();
+  const int64_t expected_body_length = indptr_length + indices_length + data_length;
+  ASSERT_EQ(expected_body_length, body_length);
+
+  ASSERT_OK(mmap_->Seek(0));
+
+  std::shared_ptr<SparseTensor> result;
+  ASSERT_OK(ReadSparseTensor(mmap_.get(), &result));
+
+  const auto& resulted_sparse_index =
+      checked_cast<const SparseCSRIndex&>(*result->sparse_index());
+  ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length);
+  ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length);
+  ASSERT_EQ(result->data()->size(), data_length);
+  ASSERT_TRUE(result->Equals(tensor));
+}
+
+TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) {
+  std::string path = "test-write-sparse-coo-tensor";
+  constexpr int64_t kBufferSize = 1 << 20;
+  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
+
+  std::vector<int64_t> shape = {2, 3, 4};
+  std::vector<std::string> dim_names = {"foo", "bar", "baz"};
+  std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+
+  auto data = Buffer::Wrap(values);
+  NumericTensor<Int64Type> t(data, shape, {}, dim_names);
+  SparseTensorImpl<SparseCOOIndex> st(t);
+
+  CheckSparseTensorRoundTrip(st);
+}
+
+TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) {
+  std::string path = "test-write-sparse-csr-matrix";
+  constexpr int64_t kBufferSize = 1 << 20;
+  ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
+
+  std::vector<int64_t> shape = {4, 6};
+  std::vector<std::string> dim_names = {"foo", "bar"};
+  std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+
+  auto data = Buffer::Wrap(values);
+  NumericTensor<Int64Type> t(data, shape, {}, dim_names);
+  SparseTensorImpl<SparseCSRIndex> st(t);
+
+  CheckSparseTensorRoundTrip(st);
+}
+
 TEST(TestRecordBatchStreamReader, MalformedInput) {
   const std::string empty_str = "";
   const std::string garbage_str = "12345678";
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 65f5d963e88db..1f04fad81743c 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -38,6 +38,7 @@
 #include "arrow/ipc/message.h"
 #include "arrow/ipc/metadata-internal.h"
 #include "arrow/record_batch.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/status.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
@@ -225,9 +226,7 @@ class ArrayLoader {
     const int num_children = type.num_children();
 
     if (num_children != 1) {
-      std::stringstream ss;
-      ss << "Wrong number of children: " << num_children;
-      return Status::Invalid(ss.str());
+      return Status::Invalid("Wrong number of children: ", num_children);
     }
 
     return LoadChildren(type.children());
@@ -343,9 +342,7 @@ Status ReadDictionary(const Buffer& metadata, const DictionaryTypeMap& dictionar
   int64_t id = *dictionary_id = dictionary_batch->id();
   auto it = dictionary_types.find(id);
   if (it == dictionary_types.end()) {
-    std::stringstream ss;
-    ss << "Do not have type metadata for dictionary with id: " << id;
-    return Status::KeyError(ss.str());
+    return Status::KeyError("Do not have type metadata for dictionary with id: ", id);
   }
 
   std::vector<std::shared_ptr<Field>> fields = {it->second};
@@ -372,10 +369,8 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect
   RETURN_NOT_OK(reader->ReadNextMessage(message));
 
   if (!(*message) && !allow_null) {
-    std::stringstream ss;
-    ss << "Expected " << FormatMessageType(expected_type)
-       << " message in stream, was null or length 0";
-    return Status::Invalid(ss.str());
+    return Status::Invalid("Expected ", FormatMessageType(expected_type),
+                           "
message in stream, was null or length 0"); } if ((*message) == nullptr) { @@ -383,10 +378,9 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect } if ((*message)->type() != expected_type) { - std::stringstream ss; - ss << "Message not expected type: " << FormatMessageType(expected_type) - << ", was: " << (*message)->type(); - return Status::IOError(ss.str()); + return Status::IOError( + "Message not expected type: ", FormatMessageType(expected_type), + ", was: ", (*message)->type()); } return Status::OK(); } @@ -506,15 +500,15 @@ Status RecordBatchStreamReader::ReadNext(std::shared_ptr* batch) { class RecordBatchFileReader::RecordBatchFileReaderImpl { public: - RecordBatchFileReaderImpl() { dictionary_memo_ = std::make_shared(); } + RecordBatchFileReaderImpl() : file_(NULLPTR), footer_offset_(0), footer_(NULLPTR) { + dictionary_memo_ = std::make_shared(); + } Status ReadFooter() { int magic_size = static_cast(strlen(kArrowMagicBytes)); if (footer_offset_ <= magic_size * 2 + 4) { - std::stringstream ss; - ss << "File is too small: " << footer_offset_; - return Status::Invalid(ss.str()); + return Status::Invalid("File is too small: ", footer_offset_); } std::shared_ptr buffer; @@ -523,9 +517,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { const int64_t expected_footer_size = magic_size + sizeof(int32_t); if (buffer->size() < expected_footer_size) { - std::stringstream ss; - ss << "Unable to read " << expected_footer_size << "from end of file"; - return Status::Invalid(ss.str()); + return Status::Invalid("Unable to read ", expected_footer_size, "from end of file"); } if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) { @@ -709,6 +701,12 @@ Status ReadSchema(io::InputStream* stream, std::shared_ptr* out) { return Status::OK(); } +Status ReadSchema(const Message& message, std::shared_ptr* out) { + std::shared_ptr reader; + DictionaryMemo dictionary_memo; + return internal::GetSchema(message.header(), dictionary_memo, &*out); +} + Status ReadRecordBatch(const std::shared_ptr& schema, io::InputStream* file, std::shared_ptr* out) { std::unique_ptr message; @@ -735,5 +733,123 @@ Status ReadTensor(const Message& message, std::shared_ptr* out) { return Status::OK(); } +namespace { + +Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, + int64_t non_zero_length, io::RandomAccessFile* file, + std::shared_ptr* out) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); + auto* indices_buffer = sparse_index->indicesBuffer(); + std::shared_ptr indices_data; + RETURN_NOT_OK( + file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + std::vector shape({non_zero_length, ndim}); + const int64_t elsize = sizeof(int64_t); + std::vector strides({elsize, elsize * non_zero_length}); + *out = std::make_shared( + std::make_shared(indices_data, shape, strides)); + return Status::OK(); +} + +Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, + int64_t non_zero_length, io::RandomAccessFile* file, + std::shared_ptr* out) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(); + + auto* indptr_buffer = sparse_index->indptrBuffer(); + std::shared_ptr indptr_data; + RETURN_NOT_OK( + file->ReadAt(indptr_buffer->offset(), indptr_buffer->length(), &indptr_data)); + + auto* indices_buffer = sparse_index->indicesBuffer(); + std::shared_ptr indices_data; + RETURN_NOT_OK( + file->ReadAt(indices_buffer->offset(), 
indices_buffer->length(), &indices_data)); + + std::vector indptr_shape({ndim + 1}); + std::vector indices_shape({non_zero_length}); + *out = std::make_shared( + std::make_shared(indptr_data, indptr_shape), + std::make_shared(indices_data, indices_shape)); + return Status::OK(); +} + +Status MakeSparseTensorWithSparseCOOIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t non_zero_length, + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared>(sparse_index, type, data, + shape, dim_names); + return Status::OK(); +} + +Status MakeSparseTensorWithSparseCSRIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t non_zero_length, + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared>(sparse_index, type, data, + shape, dim_names); + return Status::OK(); +} + +} // namespace + +Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, + std::shared_ptr* out) { + std::shared_ptr type; + std::vector shape; + std::vector dim_names; + int64_t non_zero_length; + SparseTensorFormat::type sparse_tensor_format_id; + + RETURN_NOT_OK(internal::GetSparseTensorMetadata( + metadata, &type, &shape, &dim_names, &non_zero_length, &sparse_tensor_format_id)); + + auto message = flatbuf::GetMessage(metadata.data()); + auto sparse_tensor = reinterpret_cast(message->header()); + const flatbuf::Buffer* buffer = sparse_tensor->data(); + DCHECK(BitUtil::IsMultipleOf8(buffer->offset())) + << "Buffer of sparse index data " + << "did not start on 8-byte aligned offset: " << buffer->offset(); + + std::shared_ptr data; + RETURN_NOT_OK(file->ReadAt(buffer->offset(), buffer->length(), &data)); + + std::shared_ptr sparse_index; + switch (sparse_tensor_format_id) { + case SparseTensorFormat::COO: + RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file, + &sparse_index)); + return MakeSparseTensorWithSparseCOOIndex( + type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), + non_zero_length, data, out); + + case SparseTensorFormat::CSR: + RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file, + &sparse_index)); + return MakeSparseTensorWithSparseCSRIndex( + type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), + non_zero_length, data, out); + + default: + return Status::Invalid("Unsupported sparse index format"); + } +} + +Status ReadSparseTensor(const Message& message, std::shared_ptr* out) { + io::BufferReader buffer_reader(message.body()); + return ReadSparseTensor(*message.metadata(), &buffer_reader, out); +} + +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out) { + std::unique_ptr message; + RETURN_NOT_OK(ReadContiguousPayload(file, &message)); + DCHECK_EQ(message->type(), Message::SPARSE_TENSOR); + io::BufferReader buffer_reader(message->body()); + return ReadSparseTensor(*message->metadata(), &buffer_reader, out); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 942664d6f2269..641de3eaf7b41 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -33,6 +33,7 @@ class Buffer; class Schema; class Status; class Tensor; +class SparseTensor; namespace io { @@ -174,6 +175,14 @@ class ARROW_EXPORT RecordBatchFileReader { ARROW_EXPORT Status ReadSchema(io::InputStream* stream, std::shared_ptr* out); 
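Editor's note: as a reminder of how the `indptr`/`indices` pair reconstructed by `ReadSparseCSRIndex` addresses the values, row `r`'s non-zeros occupy positions `[indptr[r], indptr[r+1])`. A standalone sketch using the same 4x6 matrix as the CSR round-trip test above (assuming conventional row-major CSR):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // The 4x6 matrix from WithSparseCSRIndex has three non-zeros per row;
  // indptr carries rows + 1 entries, indices the column of each value.
  std::vector<int64_t> indptr = {0, 3, 6, 9, 12};
  std::vector<int64_t> indices = {0, 2, 5, 1, 2, 4, 1, 3, 4, 0, 3, 5};
  std::vector<int64_t> values = {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16};
  for (size_t r = 0; r + 1 < indptr.size(); ++r) {
    for (int64_t i = indptr[r]; i < indptr[r + 1]; ++i) {
      std::cout << "(" << r << ", " << indices[i] << ") = " << values[i] << "\n";
    }
  }
  return 0;
}
```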
+/// \brief Read Schema from encapsulated Message
+///
+/// \param[in] message a message instance containing metadata
+/// \param[out] out the resulting Schema
+/// \return Status
+ARROW_EXPORT
+Status ReadSchema(const Message& message, std::shared_ptr<Schema>* out);
+
 /// Read record batch as encapsulated IPC message with metadata size prefix and
 /// header
 ///
@@ -235,6 +244,22 @@ Status ReadTensor(io::InputStream* file, std::shared_ptr<Tensor>* out);
 ARROW_EXPORT
 Status ReadTensor(const Message& message, std::shared_ptr<Tensor>* out);
 
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor as encapsulated IPC message in file
+///
+/// \param[in] file an InputStream pointed at the start of the message
+/// \param[out] out the read sparse tensor
+/// \return Status
+ARROW_EXPORT
+Status ReadSparseTensor(io::InputStream* file, std::shared_ptr<SparseTensor>* out);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message
+///
+/// \param[in] message a Message containing the tensor metadata and body
+/// \param[out] out the read sparse tensor
+/// \return Status
+ARROW_EXPORT
+Status ReadSparseTensor(const Message& message, std::shared_ptr<SparseTensor>* out);
+
 }  // namespace ipc
 }  // namespace arrow
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 3d3355dfe17fd..1eb91998b5a93 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "arrow/array.h"
@@ -33,6 +34,7 @@
 #include "arrow/ipc/util.h"
 #include "arrow/memory_pool.h"
 #include "arrow/record_batch.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
 #include "arrow/tensor.h"
@@ -522,6 +524,15 @@ Status WriteIpcPayload(const IpcPayload& payload, io::OutputStream* dst,
   return Status::OK();
 }
 
+Status GetSchemaPayload(const Schema& schema, MemoryPool* pool,
+                        DictionaryMemo* dictionary_memo, IpcPayload* out) {
+  out->type = Message::Type::SCHEMA;
+  out->body_buffers.clear();
+  out->body_length = 0;
+  RETURN_NOT_OK(SerializeSchema(schema, pool, &out->metadata));
+  return WriteSchemaMessage(schema, dictionary_memo, &out->metadata);
+}
+
 Status GetRecordBatchPayload(const RecordBatch& batch, MemoryPool* pool,
                              IpcPayload* out) {
   RecordBatchSerializer writer(pool, 0, kMaxNestingDepth, true, out);
@@ -671,6 +682,105 @@ Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool,
   return Status::OK();
 }
 
+namespace internal {
+
+class SparseTensorSerializer {
+ public:
+  SparseTensorSerializer(int64_t buffer_start_offset, IpcPayload* out)
+      : out_(out), buffer_start_offset_(buffer_start_offset) {}
+
+  ~SparseTensorSerializer() = default;
+
+  Status VisitSparseIndex(const SparseIndex& sparse_index) {
+    switch (sparse_index.format_id()) {
+      case SparseTensorFormat::COO:
+        RETURN_NOT_OK(
+            VisitSparseCOOIndex(checked_cast<const SparseCOOIndex&>(sparse_index)));
+        break;
+
+      case SparseTensorFormat::CSR:
+        RETURN_NOT_OK(
+            VisitSparseCSRIndex(checked_cast<const SparseCSRIndex&>(sparse_index)));
+        break;
+
+      default:
+        std::stringstream ss;
+        ss << "Unable to convert type: " << sparse_index.ToString() << std::endl;
+        return Status::NotImplemented(ss.str());
+    }
+
+    return Status::OK();
+  }
+
+  Status SerializeMetadata(const SparseTensor& sparse_tensor) {
+    return WriteSparseTensorMessage(sparse_tensor, out_->body_length, buffer_meta_,
+                                    &out_->metadata);
+  }
+
+  Status Assemble(const SparseTensor& sparse_tensor) {
+    if (buffer_meta_.size() > 0) {
+      buffer_meta_.clear();
+      out_->body_buffers.clear();
+    }
+
+    RETURN_NOT_OK(VisitSparseIndex(*sparse_tensor.sparse_index()));
out_->body_buffers.emplace_back(sparse_tensor.data());
+
+    int64_t offset = buffer_start_offset_;
+    buffer_meta_.reserve(out_->body_buffers.size());
+
+    for (size_t i = 0; i < out_->body_buffers.size(); ++i) {
+      const Buffer* buffer = out_->body_buffers[i].get();
+      int64_t size = buffer->size();
+      int64_t padding = BitUtil::RoundUpToMultipleOf8(size) - size;
+      buffer_meta_.push_back({offset, size + padding});
+      offset += size + padding;
+    }
+
+    out_->body_length = offset - buffer_start_offset_;
+    DCHECK(BitUtil::IsMultipleOf8(out_->body_length));
+
+    return SerializeMetadata(sparse_tensor);
+  }
+
+ private:
+  Status VisitSparseCOOIndex(const SparseCOOIndex& sparse_index) {
+    out_->body_buffers.emplace_back(sparse_index.indices()->data());
+    return Status::OK();
+  }
+
+  Status VisitSparseCSRIndex(const SparseCSRIndex& sparse_index) {
+    out_->body_buffers.emplace_back(sparse_index.indptr()->data());
+    out_->body_buffers.emplace_back(sparse_index.indices()->data());
+    return Status::OK();
+  }
+
+  IpcPayload* out_;
+
+  std::vector<internal::BufferMetadata> buffer_meta_;
+
+  int64_t buffer_start_offset_;
+};
+
+Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool,
+                              IpcPayload* out) {
+  SparseTensorSerializer writer(0, out);
+  return writer.Assemble(sparse_tensor);
+}
+
+}  // namespace internal
+
+Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
+                         int32_t* metadata_length, int64_t* body_length,
+                         MemoryPool* pool) {
+  internal::IpcPayload payload;
+  internal::SparseTensorSerializer writer(0, &payload);
+  RETURN_NOT_OK(writer.Assemble(sparse_tensor));
+
+  *body_length = payload.body_length;
+  return internal::WriteIpcPayload(payload, dst, metadata_length);
+}
+
 Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr<Array>& dictionary,
                        int64_t buffer_start_offset, io::OutputStream* dst,
                        int32_t* metadata_length, int64_t* body_length, MemoryPool* pool) {
@@ -772,7 +882,10 @@ class SchemaWriter : public StreamBookKeeper {
  public:
   SchemaWriter(const Schema& schema, DictionaryMemo* dictionary_memo, MemoryPool* pool,
                io::OutputStream* sink)
-      : StreamBookKeeper(sink), schema_(schema), dictionary_memo_(dictionary_memo) {}
+      : StreamBookKeeper(sink),
+        pool_(pool),
+        schema_(schema),
+        dictionary_memo_(dictionary_memo) {}

   Status WriteSchema() {
 #ifndef NDEBUG
diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h
index a1c711146efe8..5b099d59c0ef0 100644
--- a/cpp/src/arrow/ipc/writer.h
+++ b/cpp/src/arrow/ipc/writer.h
@@ -30,12 +30,14 @@ namespace arrow {

 class Buffer;
+class DictionaryMemo;
 class MemoryPool;
 class RecordBatch;
 class Schema;
 class Status;
 class Table;
 class Tensor;
+class SparseTensor;

 namespace io {

@@ -53,7 +55,9 @@ class ARROW_EXPORT RecordBatchWriter {
   /// \brief Write a record batch to the stream
   ///
-  /// \param allow_64bit boolean permitting field lengths exceeding INT32_MAX
+  /// \param[in] batch the record batch to write to the stream
+  /// \param[in] allow_64bit if true, allow field lengths that don't fit
+  /// in a signed 32-bit int
   /// \return Status
   virtual Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) = 0;

@@ -160,6 +164,7 @@ class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter {
 /// \param[out] metadata_length the size of the length-prefixed flatbuffer
 ///     including padding to a 64-byte boundary
 /// \param[out] body_length the size of the contiguous buffer block plus
+/// \param[in] pool the memory pool to allocate memory from
 /// \param[in] max_recursion_depth the maximum permitted nesting schema depth
 /// \param[in] allow_64bit permit field lengths exceeding INT32_MAX. May not be
 ///     readable by other Arrow implementations
@@ -172,7 +177,9 @@ class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter {
 /// prefixed by its size, followed by each of the memory buffers in the batch
 /// written end to end (with appropriate alignment and padding):
 ///
-/// <metadata size> <metadata> <padding> <body>
+/// \code
+/// <metadata size> <metadata> <padding> <body>
+/// \endcode
 ///
 /// Finally, the absolute offsets (relative to the start of the output stream)
 /// to the end of the body and end of the metadata / data header (suffixed by
@@ -253,22 +260,40 @@ ARROW_EXPORT
 Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool,
                         std::unique_ptr<Message>* out);

-/// \brief Write arrow::Tensor as a contiguous message. The metadata and body
-/// are written assuming 64-byte alignment. It is the user's responsibility to
-/// ensure that the OutputStream has been aligned to a 64-byte multiple before
-/// writing the message.
+/// \brief Write arrow::Tensor as a contiguous message.
+///
+/// The metadata and body are written assuming 64-byte alignment. It is the
+/// user's responsibility to ensure that the OutputStream has been aligned
+/// to a 64-byte multiple before writing the message.
+///
+/// The message is written out as follows:
+/// \code
+/// <metadata size> <metadata> <tensor data>
+/// \endcode
 ///
 /// \param[in] tensor the Tensor to write
 /// \param[in] dst the OutputStream to write to
 /// \param[out] metadata_length the actual metadata length, including padding
 /// \param[out] body_length the actual message body length
 /// \return Status
-///
-///
 ARROW_EXPORT
 Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
                    int64_t* body_length);

+/// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata,
+/// sparse index, and body are written assuming 64-byte alignment. It is the
+/// user's responsibility to ensure that the OutputStream has been aligned
+/// to a 64-byte multiple before writing the message.
+///
+/// \param[in] sparse_tensor the SparseTensor to write
+/// \param[in] dst the OutputStream to write to
+/// \param[out] metadata_length the actual metadata length, including padding
+/// \param[out] body_length the actual message body length
+ARROW_EXPORT
+Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
+                         int32_t* metadata_length, int64_t* body_length,
+                         MemoryPool* pool);
+
 namespace internal {

 // These internal APIs may change without warning or deprecation

@@ -289,6 +314,17 @@ ARROW_EXPORT
 Status GetDictionaryPayloads(const Schema& schema,
                              std::vector<std::unique_ptr<IpcPayload>>* out);

+/// \brief Compute IpcPayload for the given schema
+/// \param[in] schema the Schema that is being serialized
+/// \param[in,out] pool for any required temporary memory allocations
+/// \param[in,out] dictionary_memo class for tracking dictionaries and assigning
+///     dictionary ids
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetSchemaPayload(const Schema& schema, MemoryPool* pool,
+                        DictionaryMemo* dictionary_memo, IpcPayload* out);
+
 /// \brief Compute IpcPayload for the given record batch
 /// \param[in] batch the RecordBatch that is being serialized
 /// \param[in,out] pool for any required temporary memory allocations
diff --git a/cpp/src/arrow/memory_pool-test.h b/cpp/src/arrow/memory_pool-test.h
index 34523a181ba1e..fc86d943ec116 100644
--- a/cpp/src/arrow/memory_pool-test.h
+++ b/cpp/src/arrow/memory_pool-test.h
@@ -16,6 +16,7 @@
 // under the License.
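[A minimal sketch of the body-layout computation used by SparseTensorSerializer::Assemble above: every body buffer is placed at an 8-byte-padded offset, so the total body length is itself a multiple of 8, matching the framing documented in writer.h. BufferMeta is a hypothetical stand-in for Arrow's internal buffer metadata, and the helpers mirror BitUtil::RoundUpToMultipleOf8.]

#include <cstdint>
#include <vector>

// Hypothetical stand-in for the (offset, length) metadata recorded per buffer.
struct BufferMeta {
  int64_t offset;
  int64_t length;
};

// Round up to the next multiple of 8, as BitUtil::RoundUpToMultipleOf8 does.
inline int64_t RoundUpToMultipleOf8(int64_t n) { return (n + 7) & ~int64_t(7); }

// Compute padded offsets for a sequence of body buffers, as in the
// serializer above. The returned body length is always a multiple of 8.
std::vector<BufferMeta> LayOutBody(const std::vector<int64_t>& buffer_sizes,
                                   int64_t buffer_start_offset, int64_t* body_length) {
  std::vector<BufferMeta> meta;
  meta.reserve(buffer_sizes.size());
  int64_t offset = buffer_start_offset;
  for (int64_t size : buffer_sizes) {
    const int64_t padded = RoundUpToMultipleOf8(size);
    meta.push_back({offset, padded});
    offset += padded;
  }
  *body_length = offset - buffer_start_offset;
  return meta;
}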
 #include
+#include
 #include
 #include
diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc
index 0a27141b447f7..3e0366a19da41 100644
--- a/cpp/src/arrow/memory_pool.cc
+++ b/cpp/src/arrow/memory_pool.cc
@@ -17,18 +17,16 @@
 #include "arrow/memory_pool.h"

-#include
-#include
-#include
-#include
-#include
-#include
+#include  // IWYU pragma: keep
+#include  // IWYU pragma: keep
+#include  // IWYU pragma: keep
+#include  // IWYU pragma: keep
 #include
 #include
 #include  // IWYU pragma: keep

 #include "arrow/status.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"  // IWYU pragma: keep

 #ifdef ARROW_JEMALLOC
 // Needed to support jemalloc 3 and 4
@@ -42,6 +40,11 @@ namespace arrow {

 constexpr size_t kAlignment = 64;

 namespace {
+
+// A static piece of memory for 0-size allocations, so as to return
+// an aligned non-null pointer.
+alignas(kAlignment) static uint8_t zero_size_area[1];
+
 // Allocate memory according to the alignment requirements for Arrow
 // (as of May 2016 64 bytes)
 Status AllocateAligned(int64_t size, uint8_t** out) {
@@ -49,6 +52,10 @@ Status AllocateAligned(int64_t size, uint8_t** out) {
   if (size < 0) {
     return Status::Invalid("negative malloc size");
   }
+  if (size == 0) {
+    *out = zero_size_area;
+    return Status::OK();
+  }
   if (static_cast<size_t>(size) >= std::numeric_limits<size_t>::max()) {
     return Status::CapacityError("malloc size overflows size_t");
   }
@@ -57,35 +64,86 @@ Status AllocateAligned(int64_t size, uint8_t** out) {
   *out = reinterpret_cast<uint8_t*>(_aligned_malloc(static_cast<size_t>(size), kAlignment));
   if (!*out) {
-    std::stringstream ss;
-    ss << "malloc of size " << size << " failed";
-    return Status::OutOfMemory(ss.str());
+    return Status::OutOfMemory("malloc of size ", size, " failed");
   }
 #elif defined(ARROW_JEMALLOC)
-  *out = reinterpret_cast<uint8_t*>(mallocx(
-      std::max(static_cast<size_t>(size), kAlignment), MALLOCX_ALIGN(kAlignment)));
+  *out = reinterpret_cast<uint8_t*>(
+      mallocx(static_cast<size_t>(size), MALLOCX_ALIGN(kAlignment)));
   if (*out == NULL) {
-    std::stringstream ss;
-    ss << "malloc of size " << size << " failed";
-    return Status::OutOfMemory(ss.str());
+    return Status::OutOfMemory("malloc of size ", size, " failed");
   }
 #else
   const int result = posix_memalign(reinterpret_cast<void**>(out), kAlignment,
                                     static_cast<size_t>(size));
   if (result == ENOMEM) {
-    std::stringstream ss;
-    ss << "malloc of size " << size << " failed";
-    return Status::OutOfMemory(ss.str());
+    return Status::OutOfMemory("malloc of size ", size, " failed");
   }

   if (result == EINVAL) {
-    std::stringstream ss;
-    ss << "invalid alignment parameter: " << kAlignment;
-    return Status::Invalid(ss.str());
+    return Status::Invalid("invalid alignment parameter: ", kAlignment);
   }
 #endif
   return Status::OK();
 }
+
+void DeallocateAligned(uint8_t* ptr, int64_t size) {
+  if (ptr == zero_size_area) {
+    DCHECK_EQ(size, 0);
+  } else {
+#ifdef _WIN32
+    _aligned_free(ptr);
+#elif defined(ARROW_JEMALLOC)
+    dallocx(ptr, MALLOCX_ALIGN(kAlignment));
+#else
+    std::free(ptr);
+#endif
+  }
+}
+
+Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+  uint8_t* previous_ptr = *ptr;
+  if (previous_ptr == zero_size_area) {
+    DCHECK_EQ(old_size, 0);
+    return AllocateAligned(new_size, ptr);
+  }
+  if (new_size == 0) {
+    DeallocateAligned(previous_ptr, old_size);
+    *ptr = zero_size_area;
+    return Status::OK();
+  }
+#ifdef ARROW_JEMALLOC
+  if (new_size < 0) {
+    return Status::Invalid("negative realloc size");
+  }
+  if (static_cast<size_t>(new_size) >= std::numeric_limits<size_t>::max()) {
+    return Status::CapacityError("realloc overflows size_t");
+  }
+  *ptr = reinterpret_cast<uint8_t*>(
+      rallocx(*ptr, static_cast<size_t>(new_size), MALLOCX_ALIGN(kAlignment)));
+  if (*ptr == NULL) {
+    *ptr = previous_ptr;
+    return Status::OutOfMemory("realloc of size ", new_size, " failed");
+  }
+#else
+  // Note: We cannot use realloc() here as it doesn't guarantee alignment.
+
+  // Allocate new chunk
+  uint8_t* out = nullptr;
+  RETURN_NOT_OK(AllocateAligned(new_size, &out));
+  DCHECK(out);
+  // Copy contents and release old memory chunk
+  memcpy(out, *ptr, static_cast<size_t>(std::min(new_size, old_size)));
+#ifdef _WIN32
+  _aligned_free(*ptr);
+#else
+  std::free(*ptr);
+#endif  // defined(_MSC_VER)
+  *ptr = out;
+#endif  // defined(ARROW_JEMALLOC)
+
+  return Status::OK();
+}
+
 }  // namespace

 MemoryPool::MemoryPool() {}
@@ -109,38 +167,7 @@ class DefaultMemoryPool : public MemoryPool {
   }

   Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
-#ifdef ARROW_JEMALLOC
-    uint8_t* previous_ptr = *ptr;
-    if (new_size < 0) {
-      return Status::Invalid("negative realloc size");
-    }
-    if (static_cast<size_t>(new_size) >= std::numeric_limits<size_t>::max()) {
-      return Status::CapacityError("realloc overflows size_t");
-    }
-    *ptr = reinterpret_cast<uint8_t*>(
-        rallocx(*ptr, static_cast<size_t>(new_size), MALLOCX_ALIGN(kAlignment)));
-    if (*ptr == NULL) {
-      std::stringstream ss;
-      ss << "realloc of size " << new_size << " failed";
-      *ptr = previous_ptr;
-      return Status::OutOfMemory(ss.str());
-    }
-#else
-    // Note: We cannot use realloc() here as it doesn't guarantee alignment.
-
-    // Allocate new chunk
-    uint8_t* out = nullptr;
-    RETURN_NOT_OK(AllocateAligned(new_size, &out));
-    DCHECK(out);
-    // Copy contents and release old memory chunk
-    memcpy(out, *ptr, static_cast<size_t>(std::min(new_size, old_size)));
-#ifdef _WIN32
-    _aligned_free(*ptr);
-#else
-    std::free(*ptr);
-#endif  // defined(_MSC_VER)
-    *ptr = out;
-#endif  // defined(ARROW_JEMALLOC)
+    RETURN_NOT_OK(ReallocateAligned(old_size, new_size, ptr));

     stats_.UpdateAllocatedBytes(new_size - old_size);
     return Status::OK();
@@ -149,13 +176,8 @@ class DefaultMemoryPool : public MemoryPool {
   int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }

   void Free(uint8_t* buffer, int64_t size) override {
-#ifdef _WIN32
-    _aligned_free(buffer);
-#elif defined(ARROW_JEMALLOC)
-    dallocx(buffer, MALLOCX_ALIGN(kAlignment));
-#else
-    std::free(buffer);
-#endif
+    DeallocateAligned(buffer, size);
+
     stats_.UpdateAllocatedBytes(-size);
   }
diff --git a/cpp/src/arrow/memory_pool.h b/cpp/src/arrow/memory_pool.h
index 49cd4c7efc3ed..8499b6f35d400 100644
--- a/cpp/src/arrow/memory_pool.h
+++ b/cpp/src/arrow/memory_pool.h
@@ -142,6 +142,7 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
   std::unique_ptr<ProxyMemoryPoolImpl> impl_;
 };

+/// Return the process-wide default memory pool.
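[The memory-pool change above centralizes allocation in AllocateAligned/ReallocateAligned/DeallocateAligned and makes zero-byte requests return a static, 64-byte-aligned sentinel, so callers always get a non-null aligned pointer that never reaches the underlying allocator. A reduced sketch of the pattern, assuming only the POSIX path; the real code also handles _WIN32 and jemalloc and reports errors through Status.]

#include <cassert>
#include <cstdint>
#include <cstdlib>

constexpr size_t kAlignment = 64;

// Non-null, 64-byte-aligned sentinel handed out for all zero-byte requests.
alignas(kAlignment) static uint8_t zero_size_area[1];

uint8_t* AllocateAligned(int64_t size) {
  if (size == 0) return zero_size_area;  // never touches the real allocator
  void* out = nullptr;
  if (posix_memalign(&out, kAlignment, static_cast<size_t>(size)) != 0) return nullptr;
  return static_cast<uint8_t*>(out);
}

void DeallocateAligned(uint8_t* ptr, int64_t size) {
  if (ptr == zero_size_area) {
    assert(size == 0);  // the sentinel must never be passed to free()
  } else {
    std::free(ptr);
  }
}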
ARROW_EXPORT MemoryPool* default_memory_pool(); #ifdef ARROW_NO_DEFAULT_MEMORY_POOL diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 482bc4370fdca..8696efc735b8a 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -26,12 +26,10 @@ #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/util/decimal.h" namespace arrow { @@ -163,16 +161,7 @@ TEST_F(TestPrettyPrint, StructTypeBasic) { auto simple_2 = field("two", int32()); auto simple_struct = struct_({simple_1, simple_2}); - auto int_builder_1 = std::make_shared(); - auto int_builder_2 = std::make_shared(); - StructBuilder builder(simple_struct, default_memory_pool(), - {int_builder_1, int_builder_2}); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->Append(11)); - ASSERT_OK(int_builder_2->Append(22)); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(simple_struct, "[[11, 22]]"); static const char* ex = R"expected(-- is_valid: all not null -- child 0 type: int32 @@ -202,22 +191,7 @@ TEST_F(TestPrettyPrint, StructTypeAdvanced) { auto simple_2 = field("two", int32()); auto simple_struct = struct_({simple_1, simple_2}); - auto int_builder_1 = std::make_shared(); - auto int_builder_2 = std::make_shared(); - StructBuilder builder(simple_struct, default_memory_pool(), - {int_builder_1, int_builder_2}); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->Append(11)); - ASSERT_OK(int_builder_2->Append(22)); - ASSERT_OK(builder.AppendNull()); - ASSERT_OK(int_builder_1->AppendNull()); - ASSERT_OK(int_builder_2->AppendNull()); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->AppendNull()); - ASSERT_OK(int_builder_2->Append(33)); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(simple_struct, "[[11, 22], null, [null, 33]]"); static const char* ex = R"expected(-- is_valid: [ @@ -251,24 +225,9 @@ TEST_F(TestPrettyPrint, BinaryType) { } TEST_F(TestPrettyPrint, ListType) { - Int64Builder* int_builder = new Int64Builder(); - ListBuilder list_builder(default_memory_pool(), - std::unique_ptr(int_builder)); - - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.Append(false)); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->Append(4)); - ASSERT_OK(int_builder->Append(6)); - ASSERT_OK(int_builder->Append(7)); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->Append(2)); - ASSERT_OK(int_builder->Append(3)); + auto list_type = list(int64()); + auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); - std::shared_ptr array; - ASSERT_OK(list_builder.Finish(&array)); static const char* ex = R"expected([ [ null @@ -318,18 +277,11 @@ TEST_F(TestPrettyPrint, ListType) { TEST_F(TestPrettyPrint, FixedSizeBinaryType) { std::vector is_valid = {true, true, false, true, false}; - std::vector values = {"foo", "bar", "baz"}; - std::shared_ptr array; auto type = fixed_size_binary(3); - FixedSizeBinaryBuilder builder(type); + auto array = ArrayFromJSON(type, "[\"foo\", \"bar\", null, \"baz\"]"); - ASSERT_OK(builder.Append(values[0])); - ASSERT_OK(builder.Append(values[1])); - ASSERT_OK(builder.Append(values[2])); - ASSERT_OK(builder.Finish(&array)); - - static const char* ex = "[\n 666F6F,\n 626172,\n 
62617A\n]"; + static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A\n]"; CheckArray(*array, {0, 10}, ex); static const char* ex_2 = " [\n 666F6F,\n ...\n 62617A\n ]"; CheckArray(*array, {2, 1}, ex_2); @@ -340,19 +292,7 @@ TEST_F(TestPrettyPrint, Decimal128Type) { int32_t s = 4; auto type = decimal(p, s); - - Decimal128Builder builder(type); - Decimal128 val; - - ASSERT_OK(Decimal128::FromString("123.4567", &val)); - ASSERT_OK(builder.Append(val)); - - ASSERT_OK(Decimal128::FromString("456.7891", &val)); - ASSERT_OK(builder.Append(val)); - ASSERT_OK(builder.AppendNull()); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(type, "[\"123.4567\", \"456.7891\", null]"); static const char* ex = "[\n 123.4567,\n 456.7891,\n null\n]"; CheckArray(*array, {0}, ex); @@ -392,11 +332,8 @@ TEST_F(TestPrettyPrint, DictionaryType) { } TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { - std::vector is_valid = {true, true, false, true, false}; - std::vector values = {0, 1, 2, 3, 4}; - std::shared_ptr array; - ArrayFromVector(is_valid, values, &array); - ChunkedArray chunked_array({array}); + auto array = ArrayFromJSON(int32(), "[0, 1, null, 3, null]"); + ChunkedArray chunked_array(array); static const char* expected = R"expected([ [ @@ -432,11 +369,8 @@ TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { } TEST_F(TestPrettyPrint, ColumnPrimitiveType) { - std::vector is_valid = {true, true, false, true, false}; - std::vector values = {0, 1, 2, 3, 4}; - std::shared_ptr array; - ArrayFromVector(is_valid, values, &array); std::shared_ptr int_field = field("column", int32()); + auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); Column column(int_field, ArrayVector({array})); static const char* expected = R"expected(column: int32 @@ -475,11 +409,8 @@ TEST_F(TestPrettyPrint, ColumnPrimitiveType) { } TEST_F(TestPrettyPrint, TablePrimitive) { - std::vector is_valid = {true, true, false, true, false}; - std::vector values = {0, 1, 2, 3, 4}; - std::shared_ptr array; - ArrayFromVector(is_valid, values, &array); std::shared_ptr int_field = field("column", int32()); + auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); std::shared_ptr column = std::make_shared(int_field, ArrayVector({array})); std::shared_ptr table_schema = schema({int_field}); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index ec23bfb00fcde..c524039c3e86a 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -19,7 +19,7 @@ #include #include #include -#include +#include // IWYU pragma: keep #include #include #include diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index fde6c293f9b68..ca50bc0bc993c 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -21,14 +21,17 @@ #include #include -#include "arrow/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { class Array; +class Column; class ChunkedArray; +class RecordBatch; +class Schema; class Status; +class Table; struct PrettyPrintOptions { PrettyPrintOptions(int indent_arg, int window_arg = 10, int indent_size_arg = 2, diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 7f4603ae5dfaf..7f1a0b5086e0b 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -22,7 +22,10 @@ find_package(PythonLibsNew REQUIRED) find_package(NumPy REQUIRED) +add_custom_target(arrow_python-all) add_custom_target(arrow_python) 
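[The pretty-print test rewrites above replace hand-rolled builder sequences with ArrayFromJSON, which parses a JSON literal into an Array of the given type, nulls included. Roughly how the simplified tests construct their inputs; ArrayFromJSON is an Arrow test helper, and its exact header location here is an assumption.]

#include <memory>

#include "arrow/array.h"
#include "arrow/test-util.h"  // ArrayFromJSON test helper (assumed location)
#include "arrow/type.h"

namespace arrow {

// One JSON literal replaces a dozen ListBuilder/Int64Builder calls and makes
// the intended null structure visible at a glance.
std::shared_ptr<Array> MakeListExample() {
  return ArrayFromJSON(list(int64()), "[[null], [], null, [4, 6, 7], [2, 3]]");
}

}  // namespace arrow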
+add_custom_target(arrow_python-tests) +add_dependencies(arrow_python-all arrow_python arrow_python-tests) set(ARROW_PYTHON_SRCS arrow_to_pandas.cc @@ -74,6 +77,8 @@ ADD_ARROW_LIB(arrow_python EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" ) +add_dependencies(arrow_python ${ARROW_PYTHON_LIBRARIES}) + foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_PYTHON_EXPORTING) @@ -91,37 +96,10 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") COMPILE_FLAGS -Wno-parentheses-equality) endif() -install(FILES - api.h - arrow_to_pandas.h - benchmark.h - common.h - config.h - decimal.h - deserialize.h - helpers.h - inference.h - init.h - io.h - iterators.h - numpy_convert.h - numpy_interop.h - numpy_to_arrow.h - python_to_arrow.h - platform.h - pyarrow.h - serialize.h - type_traits.h - visibility.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python") +ARROW_INSTALL_ALL_HEADERS("arrow/python") # pkg-config support -configure_file(arrow-python.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-python.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-python.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-python") # ---------------------------------------------------------------------- @@ -130,7 +108,7 @@ if (ARROW_BUILD_TESTS) util/test_main.cc) target_link_libraries(arrow_python_test_main - gtest_static) + ${GTEST_LIBRARY}) target_include_directories(arrow_python_test_main SYSTEM PUBLIC ${ARROW_PYTHON_INCLUDES}) @@ -157,6 +135,6 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}" EXTRA_LINK_LIBS ${PYTHON_LIBRARIES} EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" - LABELS "arrow_python" + LABELS "arrow_python-tests" NO_VALGRIND) endif() diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 3e04f2727ed51..8aa0bf74b7b27 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -36,9 +36,11 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parallel.h" +#include "arrow/util/string_view.h" #include "arrow/visitor_inline.h" #include "arrow/compute/api.h" @@ -75,21 +77,21 @@ template struct WrapBytes {}; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyUnicode_FromStringAndSize(data, length); } }; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } }; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } @@ -216,7 +218,7 @@ class PandasBlock { CATEGORICAL }; - PandasBlock(PandasOptions options, int64_t num_rows, int num_columns) + PandasBlock(const PandasOptions& options, int64_t num_rows, int num_columns) : num_rows_(num_rows), num_columns_(num_columns), options_(options) {} virtual ~PandasBlock() {} @@ -301,8 +303,8 @@ inline const T* GetPrimitiveValues(const Array& arr) { } template -inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& data, - double* out_values) { +inline void ConvertIntegerWithNulls(const PandasOptions& options, + const ChunkedArray& data, double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const 
auto& arr = *data.chunk(c); const T* in_values = GetPrimitiveValues(arr); @@ -315,8 +317,8 @@ inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& d } template -inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedArray& data, - T* out_values) { +inline void ConvertIntegerNoNullsSameType(const PandasOptions& options, + const ChunkedArray& data, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); if (arr.length() > 0) { @@ -328,8 +330,8 @@ inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedAr } template -inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& data, - OutType* out_values) { +inline void ConvertIntegerNoNullsCast(const PandasOptions& options, + const ChunkedArray& data, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const InType* in_values = GetPrimitiveValues(arr); @@ -339,8 +341,8 @@ inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& } } -static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +static Status ConvertBooleanWithNulls(const PandasOptions& options, + const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -363,7 +365,7 @@ static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& return Status::OK(); } -static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& data, +static void ConvertBooleanNoNulls(const PandasOptions& options, const ChunkedArray& data, uint8_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -373,59 +375,106 @@ static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& dat } } -template -static Status ConvertIntegerObjects(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - PyAcquireGIL lock; - constexpr bool is_signed = std::is_signed::value; - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const auto* in_values = GetPrimitiveValues(arr); - - for (int i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = is_signed ? 
PyLong_FromLongLong(in_values[i]) - : PyLong_FromUnsignedLongLong(in_values[i]); - RETURN_IF_PYERROR(); - } +// Generic Array -> PyObject** converter that handles object deduplication, if +// requested +template +inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, + PyObject** out_values) { + const bool has_nulls = arr.null_count() > 0; + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); } + ++out_values; } return Status::OK(); } -template -inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +template +struct MemoizationTraits { + using Scalar = typename T::c_type; +}; + +template +struct MemoizationTraits> { + // For binary, we memoize string_view as a scalar value to avoid having to + // unnecessarily copy the memory into the memo table data structure + using Scalar = util::string_view; +}; + +template +inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, + WrapFunction&& wrap_func, PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; + using Scalar = typename MemoizationTraits::Scalar; + PyAcquireGIL lock; + ::arrow::internal::ScalarMemoTable memo_table; + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index = memo_table.GetOrInsert(value); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + + auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { + return wrap_func(value, out_values); + }; + for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); - - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - auto view = arr.GetView(i); - *out_values = WrapBytes::Wrap(view.data(), view.length()); - if (*out_values == nullptr) { - PyErr_Clear(); - std::stringstream ss; - ss << "Wrapping " << view << " failed"; - return Status::UnknownError(ss.str()); - } - } - ++out_values; + if (options.deduplicate_objects) { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values)); + } else { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values)); } + out_values += arr.length(); } return Status::OK(); } -inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertIntegerObjects(const PandasOptions& options, + const ChunkedArray& data, PyObject** out_values) { + using T = typename Type::c_type; + auto WrapValue = [](T value, PyObject** out) { + *out = std::is_signed::value ? 
PyLong_FromLongLong(value) + : PyLong_FromUnsignedLongLong(value); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); +} + +template +inline Status ConvertBinaryLike(const PandasOptions& options, const ChunkedArray& data, + PyObject** out_values) { + auto WrapValue = [](const util::string_view& view, PyObject** out) { + *out = WrapBytes::Wrap(view.data(), view.length()); + if (*out == nullptr) { + PyErr_Clear(); + return Status::UnknownError("Wrapping ", view, " failed"); + } + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); +} + +inline Status ConvertNulls(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -441,7 +490,7 @@ inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, +inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; if (data.num_chunks() <= 0) { @@ -505,7 +554,8 @@ inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, } template -inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr& col, +inline Status ConvertListsLike(const PandasOptions& options, + const std::shared_ptr& col, PyObject** out_values) { const ChunkedArray& data = *col->data().get(); const auto& list_type = checked_cast(*col->type()); @@ -606,69 +656,40 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) } } -template -static Status ConvertDates(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertDates(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - - PyAcquireGIL lock; - OwnedRef date_ref; - - PyDateTime_IMPORT; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - auto type = std::dynamic_pointer_cast(arr.type()); - DCHECK(type); - - const DateUnit unit = type->unit(); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - RETURN_NOT_OK(PyDate_from_int(arr.Value(i), unit, out_values++)); - RETURN_IF_PYERROR(); - } - } + { + PyAcquireGIL lock; + PyDateTime_IMPORT; } - - return Status::OK(); + auto WrapValue = [](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(PyDate_from_int(value, Type::UNIT, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); } -template -static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertTimes(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - - PyAcquireGIL lock; - OwnedRef time_ref; - - PyDateTime_IMPORT; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - auto type = std::dynamic_pointer_cast(arr.type()); - DCHECK(type); - - const TimeUnit::type unit = type->unit(); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - RETURN_NOT_OK(PyTime_from_int(arr.Value(i), unit, out_values++)); - RETURN_IF_PYERROR(); - } - } + { + 
PyAcquireGIL lock; + PyDateTime_IMPORT; } - return Status::OK(); + const TimeUnit::type unit = checked_cast(*data.type()).unit(); + + auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(PyTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); } -static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, +static Status ConvertDecimals(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; OwnedRef decimal; @@ -717,21 +738,21 @@ class ObjectBlock : public PandasBlock { if (type == Type::BOOL) { RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer)); } else if (type == Type::UINT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::BINARY) { RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::STRING) { @@ -773,18 +794,16 @@ class ObjectBlock : public PandasBlock { CONVERTLISTSLIKE_CASE(ListType, LIST) CONVERTLISTSLIKE_CASE(NullType, NA) default: { - std::stringstream ss; - ss << "Not implemented type for conversion from List to Pandas ObjectBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "Not implemented type for conversion from List to Pandas ObjectBlock: ", + list_type->value_type()->ToString()); } } } else if (type == Type::STRUCT) { RETURN_NOT_OK(ConvertStruct(options_, data, out_buffer)); } else { - std::stringstream ss; - ss << "Unsupported type for object array output: " << col->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported type for object array output: ", + col->type()->ToString()); } placement_data_[rel_placement] = abs_placement; @@ -810,10 +829,9 @@ class IntBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); if (type != ARROW_TYPE) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas int" << sizeof(C_TYPE) << " block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + 
col->type()->ToString(), " to a Pandas int", + sizeof(C_TYPE), " block"); } ConvertIntegerNoNullsSameType(options_, data, out_buffer); @@ -841,10 +859,9 @@ class Float16Block : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::HALF_FLOAT) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float16 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float16 block"); } npy_half* out_buffer = @@ -866,10 +883,9 @@ class Float32Block : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::FLOAT) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float32 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float32 block"); } float* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; @@ -922,10 +938,9 @@ class Float64Block : public PandasBlock { ConvertNumericNullable(data, NAN, out_buffer); break; default: - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float64 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float64 block"); } #undef INTEGER_CASE @@ -945,10 +960,9 @@ class BoolBlock : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::BOOL) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas boolean block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas boolean block"); } uint8_t* out_buffer = @@ -1006,10 +1020,9 @@ class DatetimeBlock : public PandasBlock { return Status::NotImplemented("Unsupported time unit"); } } else { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas datetime block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas datetime block."); } placement_data_[rel_placement] = abs_placement; @@ -1019,7 +1032,8 @@ class DatetimeBlock : public PandasBlock { class DatetimeTZBlock : public DatetimeBlock { public: - DatetimeTZBlock(PandasOptions options, const std::string& timezone, int64_t num_rows) + DatetimeTZBlock(const PandasOptions& options, const std::string& timezone, + int64_t num_rows) : DatetimeBlock(options, num_rows, 1), timezone_(timezone) {} // Like Categorical, the internal ndarray is 1-dimensional @@ -1048,8 +1062,12 @@ class DatetimeTZBlock : public DatetimeBlock { class CategoricalBlock : public PandasBlock { public: - explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows) - : PandasBlock(options, num_rows, 1), pool_(pool), needs_copy_(false) {} + explicit CategoricalBlock(const PandasOptions& options, MemoryPool* pool, + int64_t num_rows) + : PandasBlock(options, num_rows, 1), + pool_(pool), + ordered_(false), + needs_copy_(false) {} Status Allocate() override { return Status::NotImplemented( @@ -1075,9 +1093,8 @@ class CategoricalBlock : public PandasBlock { const T* values 
= arr.raw_values(); for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsValid(i) && (values[i] < 0 || values[i] >= dict_length)) { - std::stringstream ss; - ss << "Out of bounds dictionary index: " << static_cast(values[i]); - return Status::Invalid(ss.str()); + return Status::Invalid("Out of bounds dictionary index: ", + static_cast(values[i])); } } return Status::OK(); @@ -1088,16 +1105,15 @@ class CategoricalBlock : public PandasBlock { RETURN_NOT_OK(AllocateNDArrayFromIndices(npy_type, indices_first)); } else { if (options_.zero_copy_only) { - std::stringstream ss; if (needs_copy_) { - ss << "Need to allocate categorical memory, " - << "but only zero-copy conversions allowed."; - } else { - ss << "Needed to copy " << data.num_chunks() << " chunks with " - << indices_first->null_count() - << " indices nulls, but zero_copy_only was True"; + return Status::Invalid("Need to allocate categorical memory, but ", + "only zero-copy conversions " + "allowed"); } - return Status::Invalid(ss.str()); + + return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ", + indices_first->null_count(), + " indices nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateNDArray(npy_type, 1)); @@ -1155,10 +1171,8 @@ class CategoricalBlock : public PandasBlock { RETURN_NOT_OK(WriteIndices(converted_col)); break; default: { - std::stringstream ss; - ss << "Categorical index type not supported: " - << dict_type.index_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Categorical index type not supported: ", + dict_type.index_type()->ToString()); } } @@ -1249,7 +1263,7 @@ class CategoricalBlock : public PandasBlock { bool needs_copy_; }; -Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows, +Status MakeBlock(const PandasOptions& options, PandasBlock::type type, int64_t num_rows, int num_columns, std::shared_ptr* block) { #define BLOCK_CASE(NAME, TYPE) \ case PandasBlock::NAME: \ @@ -1349,10 +1363,8 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options case Type::LIST: { auto list_type = std::static_pointer_cast(col.type()); if (!ListTypeSupported(*list_type->value_type())) { - std::stringstream ss; - ss << "Not implemented type for list in DataFrameBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type for list in DataFrameBlock: ", + list_type->value_type()->ToString()); } *output_type = PandasBlock::OBJECT; } break; @@ -1360,10 +1372,9 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options *output_type = PandasBlock::CATEGORICAL; break; default: - std::stringstream ss; - ss << "No known equivalent Pandas block for Arrow data of type "; - ss << col.type()->ToString() << " is known."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "No known equivalent Pandas block for Arrow data of type ", + col.type()->ToString(), " is known."); } return Status::OK(); } @@ -1535,7 +1546,7 @@ class DataFrameBlockCreator { class ArrowDeserializer { public: - ArrowDeserializer(PandasOptions options, const std::shared_ptr& col, + ArrowDeserializer(const PandasOptions& options, const std::shared_ptr& col, PyObject* py_ref) : col_(col), data_(*col->data().get()), options_(options), py_ref_(py_ref) {} @@ -1549,7 +1560,7 @@ class ArrowDeserializer { } template - Status ConvertValuesZeroCopy(PandasOptions options, int npy_type, + Status 
ConvertValuesZeroCopy(const PandasOptions& options, int npy_type, const std::shared_ptr& arr) { typedef typename internal::arrow_traits::T T; @@ -1657,10 +1668,8 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0) { return ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); } else if (options_.zero_copy_only) { - std::stringstream ss; - ss << "Needed to copy " << data_.num_chunks() << " chunks with " - << data_.null_count() << " nulls, but zero_copy_only was True"; - return Status::Invalid(ss.str()); + return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", + data_.null_count(), " nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateOutput(npy_type)); @@ -1751,17 +1760,13 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0) { return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); } else if (options_.zero_copy_only) { - std::stringstream ss; - ss << "Needed to copy " << data_.num_chunks() << " chunks with " - << data_.null_count() << " nulls, but zero_copy_only was True"; - return Status::Invalid(ss.str()); + return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", + data_.null_count(), " nulls, but zero_copy_only was True"); } if (data_.null_count() > 0) { if (options_.integer_object_nulls) { - using c_type = typename Type::c_type; - - return VisitObjects(ConvertIntegerObjects); + return VisitObjects(ConvertIntegerObjects); } else { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); @@ -1854,9 +1859,8 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(Decimal128Type, DECIMAL) CONVERTVALUES_LISTSLIKE_CASE(ListType, LIST) default: { - std::stringstream ss; - ss << "Not implemented type for lists: " << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type for lists: ", + list_type->value_type()->ToString()); } } #undef CONVERTVALUES_LISTSLIKE_CASE @@ -1900,15 +1904,16 @@ class ArrowDeserializer { PyObject* result_; }; -Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out) { +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out) { static std::string dummy_name = "dummy"; auto field = std::make_shared(dummy_name, arr->type()); auto col = std::make_shared(field, arr); return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertChunkedArrayToPandas(PandasOptions options, +Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& ca, PyObject* py_ref, PyObject** out) { static std::string dummy_name = "dummy"; @@ -1917,19 +1922,21 @@ Status ConvertChunkedArrayToPandas(PandasOptions options, return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, - PyObject* py_ref, PyObject** out) { +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out) { ArrowDeserializer converter(options, col, py_ref); return converter.Convert(out); } -Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr
& table, - MemoryPool* pool, PyObject** out) { +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out) { return ConvertTableToPandas(options, std::unordered_set(), table, pool, out); } -Status ConvertTableToPandas(PandasOptions options, +Status ConvertTableToPandas(const PandasOptions& options, const std::unordered_set& categorical_columns, const std::shared_ptr
& table, MemoryPool* pool, PyObject** out) { diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 753bf4823566b..20bad40971020 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -43,32 +43,32 @@ namespace py { struct PandasOptions { /// If true, we will convert all string columns to categoricals - bool strings_to_categorical; - bool zero_copy_only; - bool integer_object_nulls; - bool date_as_object; - bool use_threads; - - PandasOptions() - : strings_to_categorical(false), - zero_copy_only(false), - integer_object_nulls(false), - date_as_object(false), - use_threads(false) {} + bool strings_to_categorical = false; + bool zero_copy_only = false; + bool integer_object_nulls = false; + bool date_as_object = false; + bool use_threads = false; + + /// \brief If true, do not create duplicate PyObject versions of equal + /// objects. This only applies to immutable objects like strings or datetime + /// objects + bool deduplicate_objects = false; }; ARROW_PYTHON_EXPORT -Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out); +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out); ARROW_PYTHON_EXPORT -Status ConvertChunkedArrayToPandas(PandasOptions options, +Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& col, PyObject* py_ref, PyObject** out); ARROW_PYTHON_EXPORT -Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, - PyObject* py_ref, PyObject** out); +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out); // Convert a whole table as efficiently as possible to a pandas.DataFrame. // @@ -77,15 +77,16 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& table, - MemoryPool* pool, PyObject** out); +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out); /// Convert a whole table as efficiently as possible to a pandas.DataFrame. /// /// Explicitly name columns that should be a categorical /// This option is only used on conversions that are applied to a table. ARROW_PYTHON_EXPORT -Status ConvertTableToPandas(PandasOptions options, +Status ConvertTableToPandas(const PandasOptions& options, const std::unordered_set& categorical_columns, const std::shared_ptr
& table, MemoryPool* pool, PyObject** out); diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 6587bd328f3fb..6e41beddd1b72 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -215,10 +215,8 @@ struct PyBytesView { this->ref.reset(); return Status::OK(); } else { - std::stringstream ss; - ss << "Expected " << expected_msg << ", got a '" << Py_TYPE(obj)->tp_name - << "' object"; - return Status::TypeError(ss.str()); + return Status::TypeError("Expected ", expected_msg, ", got a '", + Py_TYPE(obj)->tp_name, "' object"); } } diff --git a/cpp/src/arrow/python/decimal.cc b/cpp/src/arrow/python/decimal.cc index 051f31faacacf..8db7c01b9ab8b 100644 --- a/cpp/src/arrow/python/decimal.cc +++ b/cpp/src/arrow/python/decimal.cc @@ -125,11 +125,9 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arr const int32_t scale = arrow_type.scale(); if (ARROW_PREDICT_FALSE(inferred_precision > precision)) { - std::stringstream buf; - buf << "Decimal type with precision " << inferred_precision - << " does not fit into precision inferred from first array element: " - << precision; - return Status::Invalid(buf.str()); + return Status::Invalid( + "Decimal type with precision ", inferred_precision, + " does not fit into precision inferred from first array element: ", precision); } if (scale != inferred_scale) { diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 2f43db6505c67..28ed1a6c364dc 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -164,11 +164,10 @@ namespace { Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) { if (overflow_message.empty()) { - std::stringstream ss; std::string obj_as_stdstring; RETURN_NOT_OK(PyObject_StdStringStr(obj, &obj_as_stdstring)); - ss << "Value " << obj_as_stdstring << " too large to fit in C integer type"; - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", obj_as_stdstring, + " too large to fit in C integer type"); } else { return Status::Invalid(overflow_message); } @@ -299,13 +298,10 @@ bool PandasObjectIsNull(PyObject* obj) { } Status InvalidValue(PyObject* obj, const std::string& why) { - std::stringstream ss; - std::string obj_as_str; RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str)); - ss << "Could not convert " << obj_as_str << " with type " << Py_TYPE(obj)->tp_name - << ": " << why; - return Status::Invalid(ss.str()); + return Status::Invalid("Could not convert ", obj_as_str, " with type ", + Py_TYPE(obj)->tp_name, ": ", why); } Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) { @@ -355,10 +351,8 @@ Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) { constexpr int64_t kDoubleMin = -(1LL << 53); if (value < kDoubleMin || value > kDoubleMax) { - std::stringstream ss; - ss << "Integer value " << value << " is outside of the range exactly" - << " representable by a IEEE 754 double precision value"; - return Status::Invalid(ss.str()); + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 double precision value"); } *out = static_cast(value); return Status::OK(); @@ -372,10 +366,8 @@ Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) { constexpr int64_t kFloatMin = -(1LL << 24); if (value < kFloatMin || value > kFloatMax) { - std::stringstream ss; - ss << "Integer value " << value << " is outside of the range exactly" - << " representable by a IEEE 754 single 
precision value"; - return Status::Invalid(ss.str()); + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 single precision value"); } *out = static_cast(value); return Status::OK(); diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 0f1d85ead2a16..c9db5f4f28531 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -58,10 +58,9 @@ class NumPyDtypeUnifier { NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(NULLPTR) {} Status InvalidMix(int new_dtype) { - std::stringstream ss; - ss << "Cannot mix NumPy dtypes " << GetNumPyTypeName(current_type_num_) << " and " - << GetNumPyTypeName(new_dtype); - return Status::Invalid(ss.str()); + return Status::Invalid("Cannot mix NumPy dtypes ", + GetNumPyTypeName(current_type_num_), " and ", + GetNumPyTypeName(new_dtype)); } int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; } @@ -250,9 +249,7 @@ class NumPyDtypeUnifier { action = Observe_DATETIME(descr); break; default: - std::stringstream ss; - ss << "Unsupported numpy type " << GetNumPyTypeName(dtype) << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype)); } if (action == INVALID) { @@ -480,10 +477,8 @@ class TypeInferrer { } else if (PyBytes_Check(key_obj)) { key = internal::PyBytes_AsStdString(key_obj); } else { - std::stringstream ss; - ss << "Expected dict key of type str or bytes, got '" << Py_TYPE(key_obj)->tp_name - << "'"; - return Status::TypeError(ss.str()); + return Status::TypeError("Expected dict key of type str or bytes, got '", + Py_TYPE(key_obj)->tp_name, "'"); } // Get or create visitor for this key auto it = struct_inferrers_.find(key); diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 463795a2109f0..6954e35c3e199 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -143,9 +143,8 @@ inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { TYPE_VISIT_INLINE(DATETIME); TYPE_VISIT_INLINE(OBJECT); } - std::stringstream ss; - ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr)); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("NumPy type not implemented: ", + GetNumPyTypeName(PyArray_TYPE(arr))); } #undef TYPE_VISIT_INLINE diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index d95e337a4870d..c73e0bc15c9c5 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -92,9 +92,7 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); default: { - std::stringstream ss; - ss << "Unsupported numpy type " << descr->type_num << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } } return Status::OK(); @@ -119,9 +117,7 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); default: { - std::stringstream ss; - ss << "Unsupported tensor type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); } } #undef NUMPY_TYPE_CASE @@ -181,9 +177,7 @@ Status NumPyDtypeToArrow(PyArray_Descr* 
descr, std::shared_ptr* out) { } } break; default: { - std::stringstream ss; - ss << "Unsupported numpy type " << descr->type_num << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } } diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index f9a5ea1b0d67e..a944b80914189 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,7 @@ namespace arrow { using internal::checked_cast; using internal::CopyBitmap; +using internal::GenerateBitsUnrolled; namespace py { @@ -245,6 +247,11 @@ class NumPyConverter { return Status::OK(); } + // Called before ConvertData to ensure Numpy input buffer is in expected + // Arrow layout + template + Status PrepareInputData(std::shared_ptr* data); + // ---------------------------------------------------------------------- // Traditional visitor conversion for non-object arrays @@ -282,9 +289,8 @@ class NumPyConverter { } Status TypeNotImplemented(std::string type_name) { - std::stringstream ss; - ss << "NumPyConverter doesn't implement <" << type_name << "> conversion. "; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("NumPyConverter doesn't implement <", type_name, + "> conversion. "); } MemoryPool* pool_; @@ -407,57 +413,49 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po } // namespace template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { if (is_strided()) { RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); + } else if (dtype_->type_num == NPY_BOOL) { + int64_t nbytes = BitUtil::BytesForBits(length_); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); + + Ndarray1DIndexer values(arr_); + int64_t i = 0; + const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; + GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); + + *data = buffer; } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } - std::shared_ptr input_type; - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - - if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); - } - return Status::OK(); } -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - int64_t nbytes = BitUtil::BytesForBits(length_); - std::shared_ptr buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); - - Ndarray1DIndexer values(arr_); +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + RETURN_NOT_OK(PrepareInputData(data)); - uint8_t* bitmap = buffer->mutable_data(); + std::shared_ptr input_type; + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); - memset(bitmap, 0, nbytes); - for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { - BitUtil::SetBit(bitmap, i); - } + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, + cast_options_, pool_, data)); } - *data = buffer; return Status::OK(); } template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - 
} else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted @@ -479,8 +477,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); } } @@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - constexpr int64_t kMillisecondsInDay = 86400000; - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted @@ -526,62 +518,61 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); } } return Status::OK(); } +// Create 16MB chunks for binary data +constexpr int32_t kBinaryChunksize = 1 << 24; + Status NumPyConverter::Visit(const BinaryType& type) { - BinaryBuilder builder(pool_); + ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_); auto data = reinterpret_cast(PyArray_DATA(arr_)); - int item_length = 0; + auto AppendNotNull = [&builder, this](const uint8_t* data) { + // This is annoying. NumPy allows strings to have nul-terminators, so + // we must check for them here + const size_t item_size = + strnlen(reinterpret_cast(data), static_cast(itemsize_)); + return builder.Append(data, static_cast(item_size)); + }; + if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder.AppendNull()); } else { - // This is annoying. 
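The AppendNotNull lambda above bounds its scan with strnlen, replacing the manual nul-terminator loop removed just below. A standalone illustration of why the cap matters for fixed-width cells, including ones that use every byte and carry no terminator at all:

```cpp
#include <cstdio>
#include <cstring>

int main() {
  // An 8-byte fixed-width cell, logically "foo" plus nul padding.
  const char cell[8] = {'f', 'o', 'o', '\0', '\0', '\0', '\0', '\0'};
  // strnlen stops at the first nul OR at the itemsize cap, so a cell that
  // fills all 8 bytes without a terminator still yields 8 and the scan
  // never reads past the cell boundary.
  std::printf("effective length: %zu\n", strnlen(cell, sizeof(cell)));  // 3
  return 0;
}
```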
NumPy allows strings to have nul-terminators, so - // we must check for them here - for (item_length = 0; item_length < itemsize_; ++item_length) { - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { - for (item_length = 0; item_length < itemsize_; ++item_length) { - // Look for nul-terminator - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); data += stride_; } } - std::shared_ptr result; + ArrayVector result; RETURN_NOT_OK(builder.Finish(&result)); - return PushArray(result->data()); + for (auto arr : result) { + RETURN_NOT_OK(PushArray(arr->data())); + } + return Status::OK(); } Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { auto byte_width = type.byte_width(); if (itemsize_ != byte_width) { - std::stringstream ss; - ss << "Got bytestring of length " << itemsize_ << " (expected " << byte_width << ")"; - return Status::Invalid(ss.str()); + return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ", + byte_width, ")"); } FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); @@ -656,9 +647,8 @@ Status NumPyConverter::Visit(const StringType& type) { if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { return builder.Append(data, itemsize_); } else { - std::stringstream ss; - ss << "Encountered non-UTF8 binary value: " << HexEncode(data, itemsize_); - return Status::Invalid(ss.str()); + return Status::Invalid("Encountered non-UTF8 binary value: ", + HexEncode(data, itemsize_)); } } else { return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, @@ -702,9 +692,7 @@ Status NumPyConverter::Visit(const StructType& type) { for (auto field : type.children()) { PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str()); if (tup == NULL) { - std::stringstream ss; - ss << "Missing field '" << field->name() << "' in struct array"; - return Status::TypeError(ss.str()); + return Status::TypeError("Missing field '", field->name(), "' in struct array"); } PyArray_Descr* sub_dtype = reinterpret_cast(PyTuple_GET_ITEM(tup, 0)); diff --git a/cpp/src/arrow/python/platform.h b/cpp/src/arrow/python/platform.h index 4dc944e40900e..ca9b553fd641a 100644 --- a/cpp/src/arrow/python/platform.h +++ b/cpp/src/arrow/python/platform.h @@ -26,8 +26,10 @@ #include // Work around C2528 error +#ifdef _MSC_VER #if _MSC_VER >= 1900 #undef timezone #endif +#endif #endif // ARROW_PYTHON_PLATFORM_H diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 2d15ce45b3b7f..7443c54845630 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -25,6 +25,7 @@ #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/util/decimal.h" #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/decimal.h" diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a77cebc7e7d50..f5e6a5776071d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -402,10 +402,7 @@ class TimestampConverter : public TypedConverter type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); if (type->id() != Type::TIMESTAMP) { - std::ostringstream ss; - ss << "Expected np.datetime64 but got: "; - ss << 
type->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected np.datetime64 but got: ", type->ToString()); } const TimestampType& ttype = checked_cast(*type); if (unit_ != ttype.unit()) { @@ -705,10 +702,7 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) { return value_converter_->AppendSingleVirtual(obj); } default: { - std::stringstream ss; - ss << "Unknown list item type: "; - ss << value_type_->ToString(); - return Status::TypeError(ss.str()); + return Status::TypeError("Unknown list item type: ", value_type_->ToString()); } } } @@ -911,9 +905,8 @@ Status GetConverter(const std::shared_ptr& type, bool from_pandas, new StructConverter(from_pandas, strict_conversions)); break; default: - std::stringstream ss; - ss << "Sequence converter for type " << type->ToString() << " not implemented"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Sequence converter for type ", type->ToString(), + " not implemented"); } return Status::OK(); } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 7911557ee73e0..ad2636af60c63 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -55,8 +55,8 @@ using internal::checked_cast; namespace py { -/// A Sequence is a heterogeneous collections of elements. It can contain -/// scalar Python types, lists, tuples, dictionaries and tensors. +// A Sequence is a heterogeneous collections of elements. It can contain +// scalar Python types, lists, tuples, dictionaries and tensors. class SequenceBuilder { public: explicit SequenceBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) @@ -81,7 +81,7 @@ class SequenceBuilder { dict_offsets_({0}), set_offsets_({0}) {} - /// Appending a none to the sequence + // Appending a none to the sequence Status AppendNone() { RETURN_NOT_OK(offsets_.Append(0)); RETURN_NOT_OK(types_.Append(0)); @@ -106,90 +106,90 @@ class SequenceBuilder { return out->Append(val); } - /// Appending a boolean to the sequence + // Appending a boolean to the sequence Status AppendBool(const bool data) { return AppendPrimitive(data, &bool_tag_, &bools_); } - /// Appending a python 2 int64_t to the sequence + // Appending a python 2 int64_t to the sequence Status AppendPy2Int64(const int64_t data) { return AppendPrimitive(data, &py2_int_tag_, &py2_ints_); } - /// Appending an int64_t to the sequence + // Appending an int64_t to the sequence Status AppendInt64(const int64_t data) { return AppendPrimitive(data, &int_tag_, &ints_); } - /// Append a list of bytes to the sequence + // Append a list of bytes to the sequence Status AppendBytes(const uint8_t* data, int32_t length) { RETURN_NOT_OK(Update(bytes_.length(), &bytes_tag_)); return bytes_.Append(data, length); } - /// Appending a string to the sequence + // Appending a string to the sequence Status AppendString(const char* data, int32_t length) { RETURN_NOT_OK(Update(strings_.length(), &string_tag_)); return strings_.Append(data, length); } - /// Appending a half_float to the sequence + // Appending a half_float to the sequence Status AppendHalfFloat(const npy_half data) { return AppendPrimitive(data, &half_float_tag_, &half_floats_); } - /// Appending a float to the sequence + // Appending a float to the sequence Status AppendFloat(const float data) { return AppendPrimitive(data, &float_tag_, &floats_); } - /// Appending a double to the sequence + // Appending a double to the sequence Status AppendDouble(const double data) { return AppendPrimitive(data, &double_tag_, 
&doubles_); } - /// Appending a Date64 timestamp to the sequence + // Appending a Date64 timestamp to the sequence Status AppendDate64(const int64_t timestamp) { return AppendPrimitive(timestamp, &date64_tag_, &date64s_); } - /// Appending a tensor to the sequence - /// - /// \param tensor_index Index of the tensor in the object. + // Appending a tensor to the sequence + // + // \param tensor_index Index of the tensor in the object. Status AppendTensor(const int32_t tensor_index) { RETURN_NOT_OK(Update(tensor_indices_.length(), &tensor_tag_)); return tensor_indices_.Append(tensor_index); } - /// Appending a numpy ndarray to the sequence - /// - /// \param tensor_index Index of the tensor in the object. + // Appending a numpy ndarray to the sequence + // + // \param tensor_index Index of the tensor in the object. Status AppendNdarray(const int32_t ndarray_index) { RETURN_NOT_OK(Update(ndarray_indices_.length(), &ndarray_tag_)); return ndarray_indices_.Append(ndarray_index); } - /// Appending a buffer to the sequence - /// - /// \param buffer_index Indes of the buffer in the object. + // Appending a buffer to the sequence + // + // \param buffer_index Indes of the buffer in the object. Status AppendBuffer(const int32_t buffer_index) { RETURN_NOT_OK(Update(buffer_indices_.length(), &buffer_tag_)); return buffer_indices_.Append(buffer_index); } - /// Add a sublist to the sequence. The data contained in the sublist will be - /// specified in the "Finish" method. - /// - /// To construct l = [[11, 22], 33, [44, 55]] you would for example run - /// list = ListBuilder(); - /// list.AppendList(2); - /// list.Append(33); - /// list.AppendList(2); - /// list.Finish([11, 22, 44, 55]); - /// list.Finish(); + // Add a sublist to the sequence. The data contained in the sublist will be + // specified in the "Finish" method. + // + // To construct l = [[11, 22], 33, [44, 55]] you would for example run + // list = ListBuilder(); + // list.AppendList(2); + // list.Append(33); + // list.AppendList(2); + // list.Finish([11, 22, 44, 55]); + // list.Finish(); - /// \param size - /// The size of the sublist + // \param size + // The size of the sublist Status AppendList(Py_ssize_t size) { int32_t offset; RETURN_NOT_OK(internal::CastSize(list_offsets_.back() + size, &offset)); @@ -256,8 +256,8 @@ class SequenceBuilder { return Status::OK(); } - /// Finish building the sequence and return the result. - /// Input arrays may be nullptr + // Finish building the sequence and return the result. + // Input arrays may be nullptr Status Finish(const Array* list_data, const Array* tuple_data, const Array* dict_data, const Array* set_data, std::shared_ptr* out) { fields_.resize(num_tags_); @@ -356,28 +356,28 @@ class SequenceBuilder { std::vector type_ids_; }; -/// Constructing dictionaries of key/value pairs. Sequences of -/// keys and values are built separately using a pair of -/// SequenceBuilders. The resulting Arrow representation -/// can be obtained via the Finish method. +// Constructing dictionaries of key/value pairs. Sequences of +// keys and values are built separately using a pair of +// SequenceBuilders. The resulting Arrow representation +// can be obtained via the Finish method. 
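Before the DictBuilder class that follows, the offset bookkeeping that AppendList's comment walks through can be seen in isolation. A minimal sketch, assuming the flat child values are supplied later, at Finish time:

```cpp
#include <cstdint>
#include <vector>

int main() {
  // Offsets into a flat child array; appending a sublist of size k only
  // advances the running end offset, it stores no values yet.
  std::vector<int32_t> list_offsets = {0};
  auto append_list = [&](int32_t size) {
    list_offsets.push_back(list_offsets.back() + size);
  };
  append_list(2);  // e.g. [11, 22]
  append_list(2);  // e.g. [44, 55]
  // list_offsets == {0, 2, 4}; sublist i spans [offsets[i], offsets[i+1])
  return 0;
}
```

This is also why the real code routes `list_offsets_.back() + size` through internal::CastSize: the running sum can outgrow an int32 offset.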
class DictBuilder { public: explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) {} - /// Builder for the keys of the dictionary + // Builder for the keys of the dictionary SequenceBuilder& keys() { return keys_; } - /// Builder for the values of the dictionary + // Builder for the values of the dictionary SequenceBuilder& vals() { return vals_; } - /// Construct an Arrow StructArray representing the dictionary. - /// Contains a field "keys" for the keys and "vals" for the values. - /// \param val_list_data - /// List containing the data from nested lists in the value - /// list of the dictionary - /// - /// \param val_dict_data - /// List containing the data from nested dictionaries in the - /// value list of the dictionary + // Construct an Arrow StructArray representing the dictionary. + // Contains a field "keys" for the keys and "vals" for the values. + // \param val_list_data + // List containing the data from nested lists in the value + // list of the dictionary + // + // \param val_dict_data + // List containing the data from nested dictionaries in the + // value list of the dictionary Status Finish(const Array* key_tuple_data, const Array* key_dict_data, const Array* val_list_data, const Array* val_tuple_data, const Array* val_dict_data, const Array* val_set_data, @@ -407,10 +407,9 @@ Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* el PyObject** result) { *result = NULL; if (context == Py_None) { - std::stringstream ss; - ss << "error while calling callback on " << internal::PyObject_StdStringRepr(elem) - << ": handler not registered"; - return Status::SerializationError(ss.str()); + return Status::SerializationError("error while calling callback on ", + internal::PyObject_StdStringRepr(elem), + ": handler not registered"); } else { *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL); return PassPyError(); @@ -533,13 +532,13 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, #endif } else if (PyBytes_Check(elem)) { auto data = reinterpret_cast(PyBytes_AS_STRING(elem)); - int32_t size; + int32_t size = -1; RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size)); RETURN_NOT_OK(builder->AppendBytes(data, size)); } else if (PyUnicode_Check(elem)) { PyBytesView view; RETURN_NOT_OK(view.FromString(elem)); - int32_t size; + int32_t size = -1; RETURN_NOT_OK(internal::CastSize(view.size, &size)); RETURN_NOT_OK(builder->AppendString(view.bytes, size)); } else if (PyList_CheckExact(elem)) { diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index d90517a60a28a..bc71ec4e90bd0 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -149,6 +149,7 @@ template <> struct arrow_traits { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; + typedef typename npy_traits::value_type T; }; #define INT_DECL(TYPE) \ diff --git a/cpp/src/arrow/python/util/CMakeLists.txt b/cpp/src/arrow/python/util/CMakeLists.txt index 8edde12558fd8..30c75ef4509a3 100644 --- a/cpp/src/arrow/python/util/CMakeLists.txt +++ b/cpp/src/arrow/python/util/CMakeLists.txt @@ -25,13 +25,13 @@ if (PYARROW_BUILD_TESTS) if (APPLE) target_link_libraries(arrow/python_test_main - gtest_static + ${GTEST_LIBRARY} dl) set_target_properties(arrow/python_test_main PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() target_link_libraries(arrow/python_test_main - gtest_static + ${GTEST_LIBRARY} pthread dl ) diff --git 
a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index 7350deadcc67f..dc462972c57b7 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -199,9 +199,7 @@ static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, switch (unit) { case TimeUnit::NANO: if (val % 1000 != 0) { - std::stringstream ss; - ss << "Value " << val << " has non-zero nanoseconds"; - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", val, " has non-zero nanoseconds"); } val /= 1000; // fall through diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 33287c19ffdde..baaf5cb17500f 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -95,16 +95,13 @@ class SimpleRecordBatch : public RecordBatch { DCHECK(column != nullptr); if (!field->type()->Equals(column->type())) { - std::stringstream ss; - ss << "Column data type " << field->type()->name() - << " does not match field data type " << column->type()->name(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column data type ", field->type()->name(), + " does not match field data type ", column->type()->name()); } if (column->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match record batch's length. Expected length " - << num_rows_ << " but got length " << column->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match record batch's length. Expected length ", + num_rows_, " but got length ", column->length()); } std::shared_ptr new_schema; @@ -229,17 +226,14 @@ Status RecordBatch::Validate() const { auto arr_shared = this->column_data(i); const ArrayData& arr = *arr_shared; if (arr.length != num_rows_) { - std::stringstream ss; - ss << "Number of rows in column " << i << " did not match batch: " << arr.length - << " vs " << num_rows_; - return Status::Invalid(ss.str()); + return Status::Invalid("Number of rows in column ", i, + " did not match batch: ", arr.length, " vs ", num_rows_); } const auto& schema_type = *schema_->field(i)->type(); if (!arr.type->Equals(schema_type)) { - std::stringstream ss; - ss << "Column " << i << " type not match schema: " << arr.type->ToString() << " vs " - << schema_type.ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, + " type not match schema: ", arr.type->ToString(), " vs ", + schema_type.ToString()); } } return Status::OK(); diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 674b68b40fa6e..ceb6885da621e 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -32,6 +32,7 @@ namespace arrow { class Array; struct ArrayData; class Status; +class Table; /// \class RecordBatch /// \brief Collection of equal-length arrays matching a particular Schema diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc new file mode 100644 index 0000000000000..ed51f03f88841 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -0,0 +1,263 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
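The datetime.h hunk above preserves the old behavior exactly: a nanosecond value that is not a whole number of microseconds is rejected rather than silently truncated by the fall-through division. A standalone restatement, with an exception standing in for arrow::Status:

```cpp
#include <cstdint>
#include <stdexcept>

// ns -> us, refusing to drop sub-microsecond precision silently.
int64_t NanosToMicros(int64_t val) {
  if (val % 1000 != 0) {
    throw std::invalid_argument("value has non-zero nanoseconds");
  }
  return val / 1000;
}
```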
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Unit tests for DataType (and subclasses), Field, and Schema + +#include +#include +#include +#include + +#include + +#include + +#include "arrow/sparse_tensor.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected, + const SparseTensor& sparse_tensor) { + ASSERT_EQ(expected, sparse_tensor.format_id()); + ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id()); +} + +static inline void AssertCOOIndex( + const std::shared_ptr& sidx, const int64_t nth, + const std::vector& expected_values) { + int64_t n = static_cast(expected_values.size()); + for (int64_t i = 0; i < n; ++i) { + ASSERT_EQ(expected_values[i], sidx->Value({nth, i})); + } +} + +TEST(TestSparseCOOTensor, CreationEmptyTensor) { + std::vector shape = {2, 3, 4}; + SparseTensorImpl st1(int64(), shape); + + std::vector dim_names = {"foo", "bar", "baz"}; + SparseTensorImpl st2(int64(), shape, dim_names); + + ASSERT_EQ(0, st1.non_zero_length()); + ASSERT_EQ(0, st2.non_zero_length()); + + ASSERT_EQ(24, st1.size()); + ASSERT_EQ(24, st2.size()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); +} + +TEST(TestSparseCOOTensor, CreationFromNumericTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + std::vector dim_names = {"foo", "bar", "baz"}; + NumericTensor tensor1(buffer, shape); + NumericTensor tensor2(buffer, shape, {}, dim_names); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + + CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); + + ASSERT_EQ(12, st1.non_zero_length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* raw_data = reinterpret_cast(st1.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); + + const auto& si = internal::checked_cast(*st1.sparse_index()); + ASSERT_EQ(std::string("SparseCOOIndex"), si.ToString()); + + std::shared_ptr sidx = si.indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + AssertCOOIndex(sidx, 0, {0, 0, 0}); + AssertCOOIndex(sidx, 1, {0, 0, 2}); + AssertCOOIndex(sidx, 2, {0, 1, 1}); + AssertCOOIndex(sidx, 10, {1, 2, 1}); + AssertCOOIndex(sidx, 11, {1, 2, 3}); +} + +TEST(TestSparseCOOTensor, CreationFromTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + std::vector dim_names = {"foo", "bar", "baz"}; + Tensor tensor1(int64(), buffer, shape); + Tensor tensor2(int64(), buffer, shape, 
{}, dim_names); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + + ASSERT_EQ(12, st1.non_zero_length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* raw_data = reinterpret_cast(st1.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); + + const auto& si = internal::checked_cast(*st1.sparse_index()); + std::shared_ptr sidx = si.indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + AssertCOOIndex(sidx, 0, {0, 0, 0}); + AssertCOOIndex(sidx, 1, {0, 0, 2}); + AssertCOOIndex(sidx, 2, {0, 1, 1}); + AssertCOOIndex(sidx, 10, {1, 2, 1}); + AssertCOOIndex(sidx, 11, {1, 2, 3}); +} + +TEST(TestSparseCOOTensor, CreationFromNonContiguousTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0, + 13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0}; + std::vector strides = {192, 64, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, shape, strides); + SparseTensorImpl st(tensor); + + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); + + const int64_t* raw_data = reinterpret_cast(st.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); + + const auto& si = internal::checked_cast(*st.sparse_index()); + std::shared_ptr sidx = si.indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + AssertCOOIndex(sidx, 0, {0, 0, 0}); + AssertCOOIndex(sidx, 1, {0, 0, 2}); + AssertCOOIndex(sidx, 2, {0, 1, 1}); + AssertCOOIndex(sidx, 10, {1, 2, 1}); + AssertCOOIndex(sidx, 11, {1, 2, 3}); +} + +TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { + std::vector shape = {6, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + std::vector dim_names = {"foo", "bar", "baz"}; + NumericTensor tensor1(buffer, shape); + NumericTensor tensor2(buffer, shape, {}, dim_names); + + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + + CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1); + + ASSERT_EQ(12, st1.non_zero_length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* raw_data = reinterpret_cast(st1.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); + + const auto& si = internal::checked_cast(*st1.sparse_index()); + ASSERT_EQ(std::string("SparseCSRIndex"), si.ToString()); + ASSERT_EQ(1, si.indptr()->ndim()); + ASSERT_EQ(1, si.indices()->ndim()); + + const int64_t* indptr_begin = reinterpret_cast(si.indptr()->raw_data()); + std::vector indptr_values(indptr_begin, + indptr_begin + si.indptr()->shape()[0]); + + ASSERT_EQ(7, indptr_values.size()); + ASSERT_EQ(std::vector({0, 2, 4, 6, 8, 10, 12}), indptr_values); + + const int64_t* indices_begin = + reinterpret_cast(si.indices()->raw_data()); + std::vector indices_values(indices_begin, + indices_begin + si.indices()->shape()[0]); 
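The non-contiguous tests above choose byte strides ({192, 64, 16} for the rank-3 tensor, {64, 16} for the matrix) so that every other int64 slot is zero padding. The addressing rule those strides imply, as a short sketch:

```cpp
#include <cstdint>

// Element (i, j, k) of a strided tensor lives at the byte offset given by
// the dot product of its coordinates with the per-dimension byte strides.
int64_t ValueAt(const uint8_t* data, const int64_t strides[3],
                int64_t i, int64_t j, int64_t k) {
  const uint8_t* p = data + i * strides[0] + j * strides[1] + k * strides[2];
  return *reinterpret_cast<const int64_t*>(p);
}
```

With strides of {192, 64, 16} over 8-byte values, consecutive k steps skip 16 bytes, which is exactly why the converter cannot zero-copy and must walk coordinates instead.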
+ + ASSERT_EQ(12, indices_values.size()); + ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); +} + +TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) { + std::vector shape = {6, 4}; + std::vector values = {1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 0, 0, 11, 0, 0, 0, 12, 0, + 13, 0, 0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 16, 0}; + std::vector strides = {64, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + Tensor tensor(int64(), buffer, shape, strides); + SparseTensorImpl st(tensor); + + ASSERT_EQ(12, st.non_zero_length()); + ASSERT_TRUE(st.is_mutable()); + + const int64_t* raw_data = reinterpret_cast(st.raw_data()); + AssertNumericDataEqual(raw_data, {1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16}); + + const auto& si = internal::checked_cast(*st.sparse_index()); + ASSERT_EQ(1, si.indptr()->ndim()); + ASSERT_EQ(1, si.indices()->ndim()); + + const int64_t* indptr_begin = reinterpret_cast(si.indptr()->raw_data()); + std::vector indptr_values(indptr_begin, + indptr_begin + si.indptr()->shape()[0]); + + ASSERT_EQ(7, indptr_values.size()); + ASSERT_EQ(std::vector({0, 2, 4, 6, 8, 10, 12}), indptr_values); + + const int64_t* indices_begin = + reinterpret_cast(si.indices()->raw_data()); + std::vector indices_values(indices_begin, + indices_begin + si.indices()->shape()[0]); + + ASSERT_EQ(12, indices_values.size()); + ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); +} + +} // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc new file mode 100644 index 0000000000000..a55f51a56733f --- /dev/null +++ b/cpp/src/arrow/sparse_tensor.cc @@ -0,0 +1,452 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
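The indptr/indices assertions above encode the standard compressed-sparse-row layout: row i's nonzeros occupy the half-open slice [indptr[i], indptr[i+1]) of the indices and values arrays. A reading sketch of that invariant:

```cpp
#include <cstdint>
#include <vector>

// Look up (row, col) in a CSR triple; absent entries are structural zeros.
int64_t CsrValue(const std::vector<int64_t>& indptr,
                 const std::vector<int64_t>& indices,
                 const std::vector<int64_t>& values,
                 int64_t row, int64_t col) {
  for (int64_t k = indptr[row]; k < indptr[row + 1]; ++k) {
    if (indices[k] == col) return values[k];
  }
  return 0;
}
```

For the 6x4 test matrix, indptr {0, 2, 4, 6, 8, 10, 12} says every row holds exactly two nonzeros, at the columns listed in indices.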
+ +#include "arrow/sparse_tensor.h" + +#include +#include +#include + +#include "arrow/compare.h" +#include "arrow/util/logging.h" + +namespace arrow { + +namespace { + +// ---------------------------------------------------------------------- +// SparseTensorConverter + +template +class SparseTensorConverter { + public: + explicit SparseTensorConverter(const NumericTensor&) {} + + Status Convert() { return Status::Invalid("Unsupported sparse index"); } +}; + +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCOOIndex + +template +struct SparseTensorConverterBase { + using NumericTensorType = NumericTensor; + using value_type = typename NumericTensorType::value_type; + + explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {} + + bool TensorIsTriviallyIterable() const { + return tensor_.ndim() <= 1 || tensor_.is_contiguous(); + } + + size_t CountNonZero() const { + if (tensor_.size() == 0) { + return 0; + } + + if (TensorIsTriviallyIterable()) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + return std::count_if(data, data + tensor_.size(), + [](value_type x) { return x != 0; }); + } + + const std::vector& shape = tensor_.shape(); + const int64_t ndim = tensor_.ndim(); + + size_t count = 0; + std::vector coord(ndim, 0); + for (int64_t n = tensor_.size(); n > 0; n--) { + if (tensor_.Value(coord) != 0) { + ++count; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } + } + return count; + } + + const NumericTensorType& tensor_; +}; + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} + + Status Convert() { + const int64_t ndim = tensor_.ndim(); + const int64_t nonzero_count = static_cast(CountNonZero()); + + std::shared_ptr indices_buffer; + RETURN_NOT_OK( + AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + + std::shared_ptr values_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + const int64_t count = ndim == 0 ? 
1 : tensor_.shape()[0]; + for (int64_t i = 0; i < count; ++i, ++data) { + if (*data != 0) { + *indices++ = i; + *values++ = *data; + } + } + } else { + const std::vector& shape = tensor_.shape(); + std::vector coord(ndim, 0); + + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); + if (tensor_.Value(coord) != 0) { + *values++ = x; + + int64_t* indp = indices; + for (int64_t i = 0; i < ndim; ++i) { + *indp = coord[i]; + indp += nonzero_count; + } + indices++; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } + } + } + + // make results + const std::vector indices_shape = {nonzero_count, ndim}; + const int64_t indices_elsize = sizeof(int64_t); + const std::vector indices_strides = {indices_elsize, + indices_elsize * nonzero_count}; + sparse_index = + std::make_shared(std::make_shared( + indices_buffer, indices_shape, indices_strides)); + data = values_buffer; + + return Status::OK(); + } + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using SparseTensorConverterBase::tensor_; + using SparseTensorConverterBase::CountNonZero; +}; + +template +void MakeSparseTensorFromTensor(const Tensor& tensor, + std::shared_ptr* sparse_index, + std::shared_ptr* data) { + NumericTensor numeric_tensor(tensor.data(), tensor.shape(), tensor.strides()); + SparseTensorConverter converter(numeric_tensor); + DCHECK_OK(converter.Convert()); + *sparse_index = converter.sparse_index; + *data = converter.data; +} + +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSRIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} + + Status Convert() { + const int64_t ndim = tensor_.ndim(); + if (ndim > 2) { + return Status::Invalid("Invalid tensor dimension"); + } + + const int64_t nr = tensor_.shape()[0]; + const int64_t nc = tensor_.shape()[1]; + const int64_t nonzero_count = static_cast(CountNonZero()); + + std::shared_ptr indptr_buffer; + std::shared_ptr indices_buffer; + + std::shared_ptr values_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer)); + int64_t* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + + int64_t k = 0; + *indptr++ = 0; + for (int64_t i = 0; i < nr; ++i) { + for (int64_t j = 0; j < nc; ++j) { + const value_type x = tensor_.Value({i, j}); + if (x != 0) { + *values++ = x; + *indices++ = j; + k++; + } + } + *indptr++ = k; + } + } + + std::vector indptr_shape({nr + 1}); + std::shared_ptr indptr_tensor = + std::make_shared(indptr_buffer, indptr_shape); + + std::vector indices_shape({nonzero_count}); + std::shared_ptr indices_tensor = + std::make_shared(indices_buffer, indices_shape); + + 
sparse_index = std::make_shared(indptr_tensor, indices_tensor); + data = values_buffer; + + return Status::OK(); + } + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::tensor_; + using SparseTensorConverterBase::CountNonZero; +}; + +// ---------------------------------------------------------------------- +// Instantiate templates + +#define INSTANTIATE_SPARSE_TENSOR_CONVERTER(IndexType) \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter; \ + template class SparseTensorConverter + +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); + +} // namespace + +// ---------------------------------------------------------------------- +// SparseCOOIndex + +// Constructor with a column-major NumericTensor +SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) + : SparseIndexBase(coords->shape()[0]), coords_(coords) { + DCHECK(coords_->is_column_major()); +} + +std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); } + +// ---------------------------------------------------------------------- +// SparseCSRIndex + +// Constructor with two index vectors +SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices) + : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) { + DCHECK_EQ(1, indptr_->ndim()); + DCHECK_EQ(1, indices_->ndim()); +} + +std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex"); } + +// ---------------------------------------------------------------------- +// SparseTensor + +// Constructor with all attributes +SparseTensor::SparseTensor(const std::shared_ptr& type, + const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names) + : type_(type), + data_(data), + shape_(shape), + sparse_index_(sparse_index), + dim_names_(dim_names) { + DCHECK(is_tensor_supported(type->id())); +} + +const std::string& SparseTensor::dim_name(int i) const { + static const std::string kEmpty = ""; + if (dim_names_.size() == 0) { + return kEmpty; + } else { + DCHECK_LT(i, static_cast(dim_names_.size())); + return dim_names_[i]; + } +} + +int64_t SparseTensor::size() const { + return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); +} + +bool SparseTensor::Equals(const SparseTensor& other) const { + return SparseTensorEquals(*this, other); +} + +// ---------------------------------------------------------------------- +// SparseTensorImpl + +// Constructor with a dense tensor +template +SparseTensorImpl::SparseTensorImpl( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names) + : SparseTensorImpl(nullptr, type, nullptr, shape, dim_names) {} + +// Constructor with a dense tensor +template +template +SparseTensorImpl::SparseTensorImpl(const NumericTensor& tensor) + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), + tensor.dim_names_) { + SparseTensorConverter converter(tensor); + DCHECK_OK(converter.Convert()); + sparse_index_ = converter.sparse_index; + data_ = converter.data; +} + +// 
Constructor with a dense tensor +template +SparseTensorImpl::SparseTensorImpl(const Tensor& tensor) + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), + tensor.dim_names_) { + switch (tensor.type()->id()) { + case Type::UINT8: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT8: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT16: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT16: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT32: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT32: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT64: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT64: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::HALF_FLOAT: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::FLOAT: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::DOUBLE: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + default: + break; + } +} + +// ---------------------------------------------------------------------- +// Instantiate templates + +#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ + template class ARROW_TEMPLATE_EXPORT SparseTensorImpl; \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&) + +INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); +INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex); + +} // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h new file mode 100644 index 0000000000000..c7693d2ec9579 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor.h @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_SPARSE_TENSOR_H +#define ARROW_SPARSE_TENSOR_H + +#include +#include +#include + +#include "arrow/tensor.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// SparseIndex class + +/// \brief EXPERIMENTAL: Sparse tensor format enumeration +struct SparseTensorFormat { + enum type { COO, CSR }; +}; + +/// \brief EXPERIMENTAL: The base class for representing index of non-zero +/// values in sparse tensor +class ARROW_EXPORT SparseIndex { + public: + explicit SparseIndex(SparseTensorFormat::type format_id, int64_t non_zero_length) + : format_id_(format_id), non_zero_length_(non_zero_length) {} + + virtual ~SparseIndex() = default; + + SparseTensorFormat::type format_id() const { return format_id_; } + int64_t non_zero_length() const { return non_zero_length_; } + + virtual std::string ToString() const = 0; + + protected: + SparseTensorFormat::type format_id_; + int64_t non_zero_length_; +}; + +template +class SparseIndexBase : public SparseIndex { + public: + explicit SparseIndexBase(int64_t non_zero_length) + : SparseIndex(SparseIndexType::format_id, non_zero_length) {} +}; + +// ---------------------------------------------------------------------- +// SparseCOOIndex class + +/// \brief EXPERIMENTAL: The index data for COO sparse tensor +class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { + public: + using CoordsTensor = NumericTensor; + + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO; + + // Constructor with a column-major NumericTensor + explicit SparseCOOIndex(const std::shared_ptr& coords); + + const std::shared_ptr& indices() const { return coords_; } + + std::string ToString() const override; + + bool Equals(const SparseCOOIndex& other) const { + return indices()->Equals(*other.indices()); + } + + protected: + std::shared_ptr coords_; +}; + +// ---------------------------------------------------------------------- +// SparseCSRIndex class + +/// \brief EXPERIMENTAL: The index data for CSR sparse matrix +class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { + public: + using IndexTensor = NumericTensor; + + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; + + // Constructor with two index vectors + explicit SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices); + + const std::shared_ptr& indptr() const { return indptr_; } + const std::shared_ptr& indices() const { return indices_; } + + std::string ToString() const override; + + bool Equals(const SparseCSRIndex& other) const { + return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()); + } + + protected: + std::shared_ptr indptr_; + std::shared_ptr indices_; +}; + +// ---------------------------------------------------------------------- +// SparseTensor class + +/// \brief EXPERIMENTAL: The base class of sparse tensor container +class ARROW_EXPORT SparseTensor { + public: + virtual ~SparseTensor() = default; + + SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); } + + std::shared_ptr type() const { return type_; } + std::shared_ptr data() const { return data_; } + + const uint8_t* raw_data() const { return data_->data(); } + uint8_t* raw_mutable_data() const { return data_->mutable_data(); } + + const std::vector& shape() const { return shape_; } + + const std::shared_ptr& sparse_index() const { return 
sparse_index_; } + + int ndim() const { return static_cast(shape_.size()); } + + const std::string& dim_name(int i) const; + + /// Total number of value cells in the sparse tensor + int64_t size() const; + + /// Return true if the underlying data buffer is mutable + bool is_mutable() const { return data_->is_mutable(); } + + /// Total number of non-zero cells in the sparse tensor + int64_t non_zero_length() const { + return sparse_index_ ? sparse_index_->non_zero_length() : 0; + } + + bool Equals(const SparseTensor& other) const; + + protected: + // Constructor with all attributes + SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names); + + std::shared_ptr type_; + std::shared_ptr data_; + std::vector shape_; + std::shared_ptr sparse_index_; + + /// These names are optional + std::vector dim_names_; +}; + +// ---------------------------------------------------------------------- +// SparseTensorImpl class + +/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index +/// type +template +class ARROW_EXPORT SparseTensorImpl : public SparseTensor { + public: + virtual ~SparseTensorImpl() = default; + + // Constructor with all attributes + SparseTensorImpl(const std::shared_ptr& sparse_index, + const std::shared_ptr& type, + const std::shared_ptr& data, const std::vector& shape, + const std::vector& dim_names) + : SparseTensor(type, data, shape, sparse_index, dim_names) {} + + // Constructor for empty sparse tensor + SparseTensorImpl(const std::shared_ptr& type, + const std::vector& shape, + const std::vector& dim_names = {}); + + // Constructor with a dense numeric tensor + template + explicit SparseTensorImpl(const NumericTensor& tensor); + + // Constructor with a dense tensor + explicit SparseTensorImpl(const Tensor& tensor); + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); +}; + +/// \brief EXPERIMENTAL: Type alias for COO sparse tensor +using SparseTensorCOO = SparseTensorImpl; + +/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix +using SparseTensorCSR = SparseTensorImpl; +using SparseMatrixCSR = SparseTensorImpl; + +} // namespace arrow + +#endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc index 8be8b36d13bd8..db7f087149017 100644 --- a/cpp/src/arrow/status.cc +++ b/cpp/src/arrow/status.cc @@ -13,6 +13,7 @@ #include "arrow/status.h" #include +#include namespace arrow { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 7280133a65fb9..96b018b650dfa 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -25,32 +25,43 @@ #endif #include "arrow/util/macros.h" +#include "arrow/util/string_builder.h" #include "arrow/util/visibility.h" #ifdef ARROW_EXTRA_ERROR_CONTEXT -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \ - return Status(_s.code(), ss.str()); \ - } \ +/// \brief Return with given status if condition is met. 
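The ARROW_RETURN_IF_ helper whose definition follows lets call sites collapse hand-written condition-plus-return blocks, and under ARROW_EXTRA_ERROR_CONTEXT it also appends file and line information. A usage sketch once the macros below are in place (the function itself is hypothetical):

```cpp
#include "arrow/status.h"

arrow::Status CheckedDivide(int64_t a, int64_t b, int64_t* out) {
  // Expands to an early return of the given Status when the condition
  // holds; with extra error context, file/line/expression are appended.
  ARROW_RETURN_IF(b == 0, arrow::Status::Invalid("division by zero"));
  *out = a / b;
  return arrow::Status::OK();
}
```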
+#define ARROW_RETURN_IF_(condition, status, expr) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + ::arrow::Status _s = (status); \ + std::stringstream ss; \ + ss << _s.message() << "\n" << __FILE__ << ":" << __LINE__ << " code: " << expr; \ + return ::arrow::Status(_s.code(), ss.str()); \ + } \ } while (0) #else -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - return _s; \ - } \ - } while (false) +#define ARROW_RETURN_IF_(condition, status, _) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + return (status); \ + } \ + } while (0) #endif // ARROW_EXTRA_ERROR_CONTEXT +#define ARROW_RETURN_IF(condition, status) \ + ARROW_RETURN_IF_(condition, status, ARROW_STRINGIFY(status)) + +/// \brief Propagate any non-successful Status to the caller +#define ARROW_RETURN_NOT_OK(status) \ + do { \ + ::arrow::Status __s = (status); \ + ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \ + } while (false) + #define RETURN_NOT_OK_ELSE(s, else_) \ do { \ ::arrow::Status _s = (s); \ @@ -60,17 +71,6 @@ } \ } while (false) -#define ARROW_RETURN_FAILURE_IF_FALSE(condition, status) \ - do { \ - if (!(condition)) { \ - Status _status = (status); \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << _status.CodeAsString() << " \n " \ - << _status.message(); \ - return Status(_status.code(), ss.str()); \ - } \ - } while (0) - // This is an internal-use macro and should not be used in public headers. #ifndef RETURN_NOT_OK #define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s) @@ -107,10 +107,18 @@ enum class StatusCode : char { class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; #endif +/// \brief Status outcome object (success or error) +/// +/// The Status object is an object holding the outcome of an operation. +/// The outcome is represented as a StatusCode, either success +/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values). +/// +/// Additionally, if an error occurred, a specific error message is generally +/// attached. class ARROW_EXPORT Status { public: // Create a success status. - Status() noexcept : state_(NULL) {} + Status() noexcept : state_(NULLPTR) {} ~Status() noexcept { // ARROW-2400: On certain compilers, splitting off the slow path improves // performance significantly. @@ -122,123 +130,174 @@ class ARROW_EXPORT Status { Status(StatusCode code, const std::string& msg); // Copy the specified status. - Status(const Status& s); - Status& operator=(const Status& s); + inline Status(const Status& s); + inline Status& operator=(const Status& s); // Move the specified status. inline Status(Status&& s) noexcept; - Status& operator=(Status&& s) noexcept; + inline Status& operator=(Status&& s) noexcept; // AND the statuses. - Status operator&(const Status& s) const noexcept; - Status operator&(Status&& s) const noexcept; - Status& operator&=(const Status& s) noexcept; - Status& operator&=(Status&& s) noexcept; + inline Status operator&(const Status& s) const noexcept; + inline Status operator&(Status&& s) const noexcept; + inline Status& operator&=(const Status& s) noexcept; + inline Status& operator&=(Status&& s) noexcept; - // Return a success status. + /// Return a success status static Status OK() { return Status(); } - // Return a success status with extra info - static Status OK(const std::string& msg) { return Status(StatusCode::OK, msg); } + /// Return a success status with a specific message + template + static Status OK(Args&&... 
args) { + return Status(StatusCode::OK, util::StringBuilder(std::forward(args)...)); + } - // Return error status of an appropriate type. - static Status OutOfMemory(const std::string& msg) { - return Status(StatusCode::OutOfMemory, msg); + /// Return an error status for out-of-memory conditions + template + static Status OutOfMemory(Args&&... args) { + return Status(StatusCode::OutOfMemory, + util::StringBuilder(std::forward(args)...)); } - static Status KeyError(const std::string& msg) { - return Status(StatusCode::KeyError, msg); + /// Return an error status for failed key lookups (e.g. column name in a table) + template + static Status KeyError(Args&&... args) { + return Status(StatusCode::KeyError, util::StringBuilder(std::forward(args)...)); } - static Status TypeError(const std::string& msg) { - return Status(StatusCode::TypeError, msg); + /// Return an error status for type errors (such as mismatching data types) + template + static Status TypeError(Args&&... args) { + return Status(StatusCode::TypeError, + util::StringBuilder(std::forward(args)...)); } - static Status UnknownError(const std::string& msg) { - return Status(StatusCode::UnknownError, msg); + /// Return an error status for unknown errors + template + static Status UnknownError(Args&&... args) { + return Status(StatusCode::UnknownError, + util::StringBuilder(std::forward(args)...)); } - static Status NotImplemented(const std::string& msg) { - return Status(StatusCode::NotImplemented, msg); + /// Return an error status when an operation or a combination of operation and + /// data types is unimplemented + template + static Status NotImplemented(Args&&... args) { + return Status(StatusCode::NotImplemented, + util::StringBuilder(std::forward(args)...)); } - static Status Invalid(const std::string& msg) { - return Status(StatusCode::Invalid, msg); + /// Return an error status for invalid data (for example a string that fails parsing) + template + static Status Invalid(Args&&... args) { + return Status(StatusCode::Invalid, util::StringBuilder(std::forward(args)...)); } - static Status CapacityError(const std::string& msg) { - return Status(StatusCode::CapacityError, msg); + /// Return an error status when a container's capacity would exceed its limits + template + static Status CapacityError(Args&&... args) { + return Status(StatusCode::CapacityError, + util::StringBuilder(std::forward(args)...)); } - static Status IOError(const std::string& msg) { - return Status(StatusCode::IOError, msg); + /// Return an error status when some IO-related operation failed + template + static Status IOError(Args&&... args) { + return Status(StatusCode::IOError, util::StringBuilder(std::forward(args)...)); } - static Status SerializationError(const std::string& msg) { - return Status(StatusCode::SerializationError, msg); + /// Return an error status when some (de)serialization operation failed + template + static Status SerializationError(Args&&... args) { + return Status(StatusCode::SerializationError, + util::StringBuilder(std::forward(args)...)); } - static Status RError(const std::string& msg) { return Status(StatusCode::RError, msg); } + template + static Status RError(Args&&... args) { + return Status(StatusCode::RError, util::StringBuilder(std::forward(args)...)); + } - static Status PlasmaObjectExists(const std::string& msg) { - return Status(StatusCode::PlasmaObjectExists, msg); + template + static Status PlasmaObjectExists(Args&&... 
args) { + return Status(StatusCode::PlasmaObjectExists, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaObjectNonexistent(const std::string& msg) { - return Status(StatusCode::PlasmaObjectNonexistent, msg); + template + static Status PlasmaObjectNonexistent(Args&&... args) { + return Status(StatusCode::PlasmaObjectNonexistent, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaObjectAlreadySealed(const std::string& msg) { - return Status(StatusCode::PlasmaObjectAlreadySealed, msg); + template + static Status PlasmaObjectAlreadySealed(Args&&... args) { + return Status(StatusCode::PlasmaObjectAlreadySealed, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaStoreFull(const std::string& msg) { - return Status(StatusCode::PlasmaStoreFull, msg); + template + static Status PlasmaStoreFull(Args&&... args) { + return Status(StatusCode::PlasmaStoreFull, + util::StringBuilder(std::forward(args)...)); } static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); } - // Return error status of an appropriate type. - static Status CodeGenError(const std::string& msg) { - return Status(StatusCode::CodeGenError, msg); + template + static Status CodeGenError(Args&&... args) { + return Status(StatusCode::CodeGenError, + util::StringBuilder(std::forward(args)...)); } - static Status ExpressionValidationError(const std::string& msg) { - return Status(StatusCode::ExpressionValidationError, msg); + template + static Status ExpressionValidationError(Args&&... args) { + return Status(StatusCode::ExpressionValidationError, + util::StringBuilder(std::forward(args)...)); } - static Status ExecutionError(const std::string& msg) { - return Status(StatusCode::ExecutionError, msg); + template + static Status ExecutionError(Args&&... args) { + return Status(StatusCode::ExecutionError, + util::StringBuilder(std::forward(args)...)); } - // Returns true iff the status indicates success. - bool ok() const { return (state_ == NULL); } + /// Return true iff the status indicates success. + bool ok() const { return (state_ == NULLPTR); } + /// Return true iff the status indicates an out-of-memory error. bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } + /// Return true iff the status indicates a key lookup error. bool IsKeyError() const { return code() == StatusCode::KeyError; } + /// Return true iff the status indicates invalid data. bool IsInvalid() const { return code() == StatusCode::Invalid; } + /// Return true iff the status indicates an IO-related failure. bool IsIOError() const { return code() == StatusCode::IOError; } + /// Return true iff the status indicates a container reaching capacity limits. bool IsCapacityError() const { return code() == StatusCode::CapacityError; } + /// Return true iff the status indicates a type error. bool IsTypeError() const { return code() == StatusCode::TypeError; } + /// Return true iff the status indicates an unknown error. bool IsUnknownError() const { return code() == StatusCode::UnknownError; } + /// Return true iff the status indicates an unimplemented operation. bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } - // An object could not be serialized or deserialized. + /// Return true iff the status indicates a (de)serialization failure bool IsSerializationError() const { return code() == StatusCode::SerializationError; } - // An error from R + /// Return true iff the status indicates a R-originated error. 
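Each factory above forwards its arguments through util::StringBuilder, which is what lets the many stringstream blocks removed throughout this patch shrink to single calls. A representative call site (the helper function is hypothetical; the Status API is as introduced here):

```cpp
#include "arrow/status.h"

arrow::Status CheckLength(int64_t expected, int64_t actual) {
  if (expected != actual) {
    // Arguments of mixed types are stringified and concatenated in order.
    return arrow::Status::Invalid("Expected length ", expected,
                                  " but got length ", actual);
  }
  return arrow::Status::OK();
}
```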
-  // An error from R
+  /// Return true iff the status indicates an R-originated error.
   bool IsRError() const { return code() == StatusCode::RError; }

-  // An error is propagated from a nested Python function.
+  /// Return true iff the status indicates a Python-originated error.
   bool IsPythonError() const { return code() == StatusCode::PythonError; }

-  // An object with this object ID already exists in the plasma store.
+  /// Return true iff the status indicates an already existing Plasma object.
   bool IsPlasmaObjectExists() const { return code() == StatusCode::PlasmaObjectExists; }

-  // An object was requested that doesn't exist in the plasma store.
+  /// Return true iff the status indicates a non-existent Plasma object.
   bool IsPlasmaObjectNonexistent() const {
     return code() == StatusCode::PlasmaObjectNonexistent;
   }

-  // An already sealed object is tried to be sealed again.
+  /// Return true iff the status indicates an already sealed Plasma object.
   bool IsPlasmaObjectAlreadySealed() const {
     return code() == StatusCode::PlasmaObjectAlreadySealed;
   }

-  // An object is too large to fit into the plasma store.
+  /// Return true iff the status indicates the Plasma store reached its capacity limit.
   bool IsPlasmaStoreFull() const { return code() == StatusCode::PlasmaStoreFull; }

   bool IsStillExecuting() const { return code() == StatusCode::StillExecuting; }
@@ -251,16 +310,19 @@ class ARROW_EXPORT Status {
   bool IsExecutionError() const { return code() == StatusCode::ExecutionError; }

-  // Return a string representation of this status suitable for printing.
-  // Returns the string "OK" for success.
+  /// \brief Return a string representation of this status suitable for printing.
+  ///
+  /// The string "OK" is returned for success.
   std::string ToString() const;

-  // Return a string representation of the status code, without the message
-  // text or posix code information.
+  /// \brief Return a string representation of the status code, without the message
+  /// text or POSIX code information.
   std::string CodeAsString() const;

+  /// \brief Return the StatusCode value attached to this status.
   StatusCode code() const { return ok() ? StatusCode::OK : state_->code; }

+  /// \brief Return the specific error message attached to this status.
   std::string message() const { return ok() ? "" : state_->msg; }

 private:
@@ -274,10 +336,10 @@ class ARROW_EXPORT Status {
   void DeleteState() {
     delete state_;
-    state_ = NULL;
+    state_ = NULLPTR;
   }
   void CopyFrom(const Status& s);
-  void MoveFrom(Status& s);
+  inline void MoveFrom(Status& s);
 };

 static inline std::ostream& operator<<(std::ostream& os, const Status& x) {
@@ -285,16 +347,16 @@ static inline std::ostream& operator<<(std::ostream& os, const Status& x) {
   return os;
 }

-inline void Status::MoveFrom(Status& s) {
+void Status::MoveFrom(Status& s) {
   delete state_;
   state_ = s.state_;
-  s.state_ = NULL;
+  s.state_ = NULLPTR;
 }

-inline Status::Status(const Status& s)
-    : state_((s.state_ == NULL) ? NULL : new State(*s.state_)) {}
+Status::Status(const Status& s)
+    : state_((s.state_ == NULLPTR) ? NULLPTR : new State(*s.state_)) {}

-inline Status& Status::operator=(const Status& s) {
+Status& Status::operator=(const Status& s) {
   // The following condition catches both aliasing (when this == &s),
   // and the common case where both s and *this are ok.
  if (state_ != s.state_) {
@@ -303,14 +365,17 @@ inline Status& Status::operator=(const Status& s) {
   return *this;
 }

-inline Status::Status(Status&& s) noexcept : state_(s.state_) { s.state_ = NULL; }
+Status::Status(Status&& s) noexcept : state_(s.state_) { s.state_ = NULLPTR; }

-inline Status& Status::operator=(Status&& s) noexcept {
+Status& Status::operator=(Status&& s) noexcept {
   MoveFrom(s);
   return *this;
 }

-inline Status Status::operator&(const Status& s) const noexcept {
+/// \cond FALSE
+// (note: emits warnings on Doxygen < 1.8.15,
+// see https://github.com/doxygen/doxygen/issues/6295)
+Status Status::operator&(const Status& s) const noexcept {
   if (ok()) {
     return s;
   } else {
@@ -318,7 +383,7 @@ inline Status Status::operator&(const Status& s) const noexcept {
   }
 }

-inline Status Status::operator&(Status&& s) const noexcept {
+Status Status::operator&(Status&& s) const noexcept {
   if (ok()) {
     return std::move(s);
   } else {
@@ -326,19 +391,20 @@
   }
 }

-inline Status& Status::operator&=(const Status& s) noexcept {
+Status& Status::operator&=(const Status& s) noexcept {
   if (ok() && !s.ok()) {
     CopyFrom(s);
   }
   return *this;
 }

-inline Status& Status::operator&=(Status&& s) noexcept {
+Status& Status::operator&=(Status&& s) noexcept {
   if (ok() && !s.ok()) {
     MoveFrom(s);
   }
   return *this;
 }
+/// \endcond

 }  // namespace arrow
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 04af4d9741c71..d232ac35e30c7 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -234,10 +234,8 @@ Status Column::ValidateData() {
   for (int i = 0; i < data_->num_chunks(); ++i) {
     std::shared_ptr<DataType> type = data_->chunk(i)->type();
     if (!this->type()->Equals(type)) {
-      std::stringstream ss;
-      ss << "In chunk " << i << " expected type " << this->type()->ToString()
-         << " but saw " << type->ToString();
-      return Status::Invalid(ss.str());
+      return Status::Invalid("In chunk ", i, " expected type ", this->type()->ToString(),
+                             " but saw ", type->ToString());
     }
   }
   return Status::OK();
@@ -301,10 +299,9 @@ class SimpleTable : public Table {
     DCHECK(col != nullptr);

     if (col->length() != num_rows_) {
-      std::stringstream ss;
-      ss << "Added column's length must match table's length. Expected length "
-         << num_rows_ << " but got length " << col->length();
-      return Status::Invalid(ss.str());
+      return Status::Invalid(
+          "Added column's length must match table's length. Expected length ", num_rows_,
Expected length ", num_rows_, + " but got length ", col->length()); } std::shared_ptr new_schema; @@ -363,15 +359,11 @@ class SimpleTable : public Table { for (int i = 0; i < num_columns(); ++i) { const Column* col = columns_[i].get(); if (col == nullptr) { - std::stringstream ss; - ss << "Column " << i << " was null"; - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, " was null"); } if (!col->field()->Equals(*schema_->field(i))) { - std::stringstream ss; - ss << "Column field " << i << " named " << col->name() - << " is inconsistent with schema"; - return Status::Invalid(ss.str()); + return Status::Invalid("Column field ", i, " named ", col->name(), + " is inconsistent with schema"); } } @@ -379,10 +371,8 @@ class SimpleTable : public Table { for (int i = 0; i < num_columns(); ++i) { const Column* col = columns_[i].get(); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Column " << i << " named " << col->name() << " expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, " named ", col->name(), " expected length ", + num_rows_, " but got length ", col->length()); } } return Status::OK(); @@ -392,7 +382,7 @@ class SimpleTable : public Table { std::vector> columns_; }; -Table::Table() {} +Table::Table() : num_rows_(0) {} std::shared_ptr
Table::Make(const std::shared_ptr<Schema>& schema,
            const std::vector<std::shared_ptr<Column>>& columns,
@@ -414,11 +404,9 @@ Status Table::FromRecordBatches(const std::shared_ptr<Schema>& schema,

   for (int i = 0; i < nbatches; ++i) {
     if (!batches[i]->schema()->Equals(*schema, false)) {
-      std::stringstream ss;
-      ss << "Schema at index " << static_cast<int>(i) << " was different: \n"
-         << schema->ToString() << "\nvs\n"
-         << batches[i]->schema()->ToString();
-      return Status::Invalid(ss.str());
+      return Status::Invalid("Schema at index ", static_cast<int>(i),
+                             " was different: \n", schema->ToString(), "\nvs\n",
+                             batches[i]->schema()->ToString());
     }
   }

@@ -458,11 +446,9 @@ Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,

   for (int i = 1; i < ntables; ++i) {
     if (!tables[i]->schema()->Equals(*schema, false)) {
-      std::stringstream ss;
-      ss << "Schema at index " << static_cast<int>(i) << " was different: \n"
-         << schema->ToString() << "\nvs\n"
-         << tables[i]->schema()->ToString();
-      return Status::Invalid(ss.str());
+      return Status::Invalid("Schema at index ", static_cast<int>(i),
+                             " was different: \n", schema->ToString(), "\nvs\n",
+                             tables[i]->schema()->ToString());
     }
   }

diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index 119e4e4491225..2ac34b4cde57d 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -39,7 +39,19 @@ class Status;
/// as one large array
 class ARROW_EXPORT ChunkedArray {
  public:
+  /// \brief Construct a chunked array from a vector of arrays
+  ///
+  /// The vector should be non-empty and all its elements should have the same
+  /// data type.
   explicit ChunkedArray(const ArrayVector& chunks);
+
+  /// \brief Construct a chunked array from a single Array
+  explicit ChunkedArray(const std::shared_ptr<Array>& chunk)
+      : ChunkedArray(ArrayVector({chunk})) {}
+
+  /// \brief Construct a chunked array from a vector of arrays and a data type
+  ///
+  /// As the data type is passed explicitly, the vector may be empty.
   ChunkedArray(const ArrayVector& chunks, const std::shared_ptr<DataType>& type);

   /// \return the total length of the chunked array; computed on construction
@@ -78,7 +90,12 @@ class ARROW_EXPORT ChunkedArray {

   std::shared_ptr<DataType> type() const { return type_; }

+  /// \brief Determine if two chunked arrays are equal.
+  ///
+  /// Two chunked arrays can be equal only if they have equal datatypes.
+  /// However, they may be equal even if they have different chunkings.
   bool Equals(const ChunkedArray& other) const;
+  /// \brief Determine if two chunked arrays are equal.
   bool Equals(const std::shared_ptr<ChunkedArray>& other) const;

  protected:
@@ -96,13 +113,26 @@
/// metadata) and a chunked data array
 class ARROW_EXPORT Column {
  public:
+  /// \brief Construct a column from a vector of arrays
+  ///
+  /// The array chunks' datatype must match the field's datatype.
   Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
+  /// \brief Construct a column from a chunked array
+  ///
+  /// The chunked array's datatype must match the field's datatype.
   Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
-
+  /// \brief Construct a column from a single array
+  ///
+  /// The array's datatype must match the field's datatype.
   Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
-  // Construct from name and array
+  /// \brief Construct a column from a name and an array
+  ///
+  /// A field with the given name and the array's datatype is automatically created.
   Column(const std::string& name, const std::shared_ptr<Array>& data);
+  /// \brief Construct a column from a name and a chunked array
+  ///
+  /// A field with the given name and the array's datatype is automatically created.
   Column(const std::string& name, const std::shared_ptr<ChunkedArray>& data);

   int64_t length() const { return data_->length(); }
@@ -147,7 +177,12 @@ class ARROW_EXPORT Column {
   /// \param[out] out The resulting vector of arrays
   Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<Array>>* out) const;

+  /// \brief Determine if two columns are equal.
+  ///
+  /// Two columns can be equal only if they have equal datatypes.
+  /// However, they may be equal even if they have different chunkings.
   bool Equals(const Column& other) const;
+  /// \brief Determine if the two columns are equal.
   bool Equals(const std::shared_ptr<Column>& other) const;

   /// \brief Verify that the column's array data is consistent with the passed
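The equality semantics documented in this hunk (equal datatypes required, chunk layout ignored) can be made concrete with a short sketch; the three input arrays are assumed to hold the values [1, 2, 3] split in different ways:

#include <memory>
#include "arrow/table.h"

// Sketch: two chunked arrays holding the same values compare equal even
// when their chunk boundaries differ. `one_chunk` is assumed to contain
// [1, 2, 3]; `first` and `second` are assumed to contain [1, 2] and [3].
bool EqualDespiteChunking(const std::shared_ptr<arrow::Array>& one_chunk,
                          const std::shared_ptr<arrow::Array>& first,
                          const std::shared_ptr<arrow::Array>& second) {
  arrow::ChunkedArray single({one_chunk});
  arrow::ChunkedArray split({first, second});
  // Equals compares logical contents, not the chunking.
  return single.Equals(split);
}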
@@ -207,11 +242,10 @@ class ARROW_EXPORT Table {
       const std::vector<std::shared_ptr<RecordBatch>>& batches,
       std::shared_ptr<Table>* table);

-  /// \return the table's schema
+  /// Return the table schema
   std::shared_ptr<Schema> schema() const { return schema_; }

-  /// \param[in] i column index, does not boundscheck
-  /// \return the i-th column
+  /// Return a column by index
   virtual std::shared_ptr<Column> column(int i) const = 0;

   /// \brief Remove column from the table, producing a new Table
@@ -243,13 +277,16 @@
   /// \brief Perform any checks to validate the input arguments
   virtual Status Validate() const = 0;

-  /// \return the number of columns in the table
+  /// \brief Return the number of columns in the table
   int num_columns() const { return schema_->num_fields(); }

-  /// \return the number of rows (the corresponding length of each column)
+  /// \brief Return the number of rows (equal to each column's logical length)
   int64_t num_rows() const { return num_rows_; }

-  /// \brief Determine if semantic contents of tables are exactly equal
+  /// \brief Determine if tables are equal
+  ///
+  /// Two tables can be equal only if they have equal schemas.
+  /// However, they may be equal even if they have different chunkings.
   bool Equals(const Table& other) const;

  protected:
@@ -262,18 +299,25 @@
   ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
 };

-/// \brief Compute a sequence of record batches from a (possibly chunked) Table
+/// \brief Compute a stream of record batches from a (possibly chunked) Table
+///
+/// The conversion is zero-copy: each record batch is a view over a slice
+/// of the table's columns.
 class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
  public:
   ~TableBatchReader() override;

-  /// \brief Read batches with the maximum possible size
+  /// \brief Construct a TableBatchReader for the given table
   explicit TableBatchReader(const Table& table);

   std::shared_ptr<Schema> schema() const override;

   Status ReadNext(std::shared_ptr<RecordBatch>* out) override;

+  /// \brief Set the desired maximum chunk size of record batches
+  ///
+  /// The actual chunk size of each record batch may be smaller, depending
+  /// on actual chunking characteristics of each table column.
   void set_chunksize(int64_t chunksize);

  private:
@@ -282,7 +326,10 @@ class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
 };

 /// \brief Construct table from multiple input tables.
-/// \return Status, fails if any schemas are different
+///
+/// The tables are concatenated vertically. Therefore, all tables should
+/// have the same schema. Each column in the output table is the result
+/// of concatenating the corresponding columns in all input tables.
 ARROW_EXPORT
 Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,
                          std::shared_ptr<Table>* table);
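A short usage sketch for the reader documented above (names are illustrative, and the ARROW_RETURN_NOT_OK convenience macro from status.h is assumed): each emitted batch is a zero-copy view, and ReadNext signals end-of-stream by setting the output to null.

#include <memory>
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/table.h"

// Sketch: stream a table as record batches of at most 64K rows each.
arrow::Status VisitBatches(const arrow::Table& table) {
  arrow::TableBatchReader reader(table);
  reader.set_chunksize(65536);  // upper bound; actual batches may be smaller
  std::shared_ptr<arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
    if (batch == nullptr) break;  // end of stream
    // ... process the zero-copy view in `batch` ...
  }
  return arrow::Status::OK();
}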
diff --git a/cpp/src/arrow/tensor-test.cc b/cpp/src/arrow/tensor-test.cc
index a437e6db5adaf..af20aed0d6ec1 100644
--- a/cpp/src/arrow/tensor-test.cc
+++ b/cpp/src/arrow/tensor-test.cc
@@ -104,13 +104,16 @@ TEST(TestTensor, ZeroDimensionalTensor) {
   ASSERT_EQ(t.strides().size(), 1);
 }

-TEST(TestNumericTensor, ElementAccess) {
+TEST(TestNumericTensor, ElementAccessWithRowMajorStrides) {
   std::vector<int64_t> shape = {3, 4};

   std::vector<int64_t> values_i64 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
   std::shared_ptr<Buffer> buffer_i64(Buffer::Wrap(values_i64));
   NumericTensor<Int64Type> t_i64(buffer_i64, shape);

+  ASSERT_TRUE(t_i64.is_row_major());
+  ASSERT_FALSE(t_i64.is_column_major());
+  ASSERT_TRUE(t_i64.is_contiguous());
   ASSERT_EQ(1, t_i64.Value({0, 0}));
   ASSERT_EQ(5, t_i64.Value({1, 0}));
   ASSERT_EQ(6, t_i64.Value({1, 1}));
@@ -121,22 +124,27 @@
   std::shared_ptr<Buffer> buffer_f32(Buffer::Wrap(values_f32));
   NumericTensor<FloatType> t_f32(buffer_f32, shape);

+  ASSERT_TRUE(t_f32.is_row_major());
+  ASSERT_FALSE(t_f32.is_column_major());
+  ASSERT_TRUE(t_f32.is_contiguous());
   ASSERT_EQ(1.1f, t_f32.Value({0, 0}));
   ASSERT_EQ(5.1f, t_f32.Value({1, 0}));
   ASSERT_EQ(6.1f, t_f32.Value({1, 1}));
   ASSERT_EQ(11.1f, t_f32.Value({2, 2}));
 }

-TEST(TestNumericTensor, ElementAccessWithRowMajorStrides) {
+TEST(TestNumericTensor, ElementAccessWithColumnMajorStrides) {
   std::vector<int64_t> shape = {3, 4};

   const int64_t i64_size = sizeof(int64_t);
-  std::vector<int64_t> values_i64 = {1, 2, 3,  4, 0, 0, 5,  6,  7,
-                                     8, 0, 0,  9, 10, 11, 12, 0, 0};
-  std::vector<int64_t> strides_i64 = {i64_size * 6, i64_size};
+  std::vector<int64_t> values_i64 = {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12};
+  std::vector<int64_t> strides_i64 = {i64_size, i64_size * 3};
   std::shared_ptr<Buffer> buffer_i64(Buffer::Wrap(values_i64));
   NumericTensor<Int64Type> t_i64(buffer_i64, shape, strides_i64);

+  ASSERT_TRUE(t_i64.is_column_major());
+  ASSERT_FALSE(t_i64.is_row_major());
+  ASSERT_TRUE(t_i64.is_contiguous());
   ASSERT_EQ(1, t_i64.Value({0, 0}));
   ASSERT_EQ(2, t_i64.Value({0, 1}));
   ASSERT_EQ(4, t_i64.Value({0, 3}));
@@ -145,13 +153,15 @@
   ASSERT_EQ(11, t_i64.Value({2, 2}));

   const int64_t f32_size = sizeof(float);
-  std::vector<float> values_f32 = {1.1f, 2.1f, 3.1f, 4.1f, 0.0f, 0.0f,
-                                   5.1f, 6.1f, 7.1f, 8.1f, 0.0f, 0.0f,
-                                   9.1f, 10.1f, 11.1f, 12.1f, 0.0f, 0.0f};
-  std::vector<int64_t> strides_f32 = {f32_size * 6, f32_size};
+  std::vector<float> values_f32 = {1.1f, 5.1f, 9.1f,  2.1f, 6.1f, 10.1f,
+                                   3.1f, 7.1f, 11.1f, 4.1f, 8.1f, 12.1f};
+  std::vector<int64_t> strides_f32 = {f32_size, f32_size * 3};
   std::shared_ptr<Buffer> buffer_f32(Buffer::Wrap(values_f32));
   NumericTensor<FloatType> t_f32(buffer_f32, shape, strides_f32);

+  ASSERT_TRUE(t_f32.is_column_major());
+  ASSERT_FALSE(t_f32.is_row_major());
+  ASSERT_TRUE(t_f32.is_contiguous());
   ASSERT_EQ(1.1f, t_f32.Value({0, 0}));
   ASSERT_EQ(2.1f, t_f32.Value({0, 1}));
   ASSERT_EQ(4.1f, t_f32.Value({0, 3}));
@@ -160,15 +170,19 @@
   ASSERT_EQ(11.1f, t_f32.Value({2, 2}));
 }

-TEST(TestNumericTensor, ElementAccessWithColumnMajorStrides) {
+TEST(TestNumericTensor, ElementAccessWithNonContiguousStrides) {
   std::vector<int64_t> shape = {3, 4};

   const int64_t i64_size = sizeof(int64_t);
-  std::vector<int64_t> values_i64 = {1, 5, 9, 0, 2, 6, 10, 0, 3, 7, 11, 0, 4, 8, 12, 0};
-  std::vector<int64_t> strides_i64 = {i64_size, i64_size * 4};
+  std::vector<int64_t> values_i64 = {1, 2, 3,  4, 0, 0, 5,  6,  7,
+                                     8, 0, 0, 9, 10, 11, 12, 0, 0};
+  std::vector<int64_t> strides_i64 = {i64_size * 6, i64_size};
   std::shared_ptr<Buffer> buffer_i64(Buffer::Wrap(values_i64));
   NumericTensor<Int64Type>
t_i64(buffer_i64, shape, strides_i64);

+  ASSERT_FALSE(t_i64.is_contiguous());
+  ASSERT_FALSE(t_i64.is_row_major());
+  ASSERT_FALSE(t_i64.is_column_major());
   ASSERT_EQ(1, t_i64.Value({0, 0}));
   ASSERT_EQ(2, t_i64.Value({0, 1}));
   ASSERT_EQ(4, t_i64.Value({0, 3}));
@@ -177,12 +191,16 @@
   ASSERT_EQ(11, t_i64.Value({2, 2}));

   const int64_t f32_size = sizeof(float);
-  std::vector<float> values_f32 = {1.1f, 5.1f, 9.1f, 0.0f, 2.1f, 6.1f, 10.1f, 0.0f,
-                                   3.1f, 7.1f, 11.1f, 0.0f, 4.1f, 8.1f, 12.1f, 0.0f};
-  std::vector<int64_t> strides_f32 = {f32_size, f32_size * 4};
+  std::vector<float> values_f32 = {1.1f, 2.1f, 3.1f,  4.1f,  0.0f, 0.0f,
+                                   5.1f, 6.1f, 7.1f,  8.1f,  0.0f, 0.0f,
+                                   9.1f, 10.1f, 11.1f, 12.1f, 0.0f, 0.0f};
+  std::vector<int64_t> strides_f32 = {f32_size * 6, f32_size};
   std::shared_ptr<Buffer> buffer_f32(Buffer::Wrap(values_f32));
   NumericTensor<FloatType> t_f32(buffer_f32, shape, strides_f32);

+  ASSERT_FALSE(t_f32.is_contiguous());
+  ASSERT_FALSE(t_f32.is_row_major());
+  ASSERT_FALSE(t_f32.is_column_major());
   ASSERT_EQ(1.1f, t_f32.Value({0, 0}));
   ASSERT_EQ(2.1f, t_f32.Value({0, 1}));
   ASSERT_EQ(4.1f, t_f32.Value({0, 3}));
diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc
index 589ee995e2181..a4db298a04d90 100644
--- a/cpp/src/arrow/tensor.cc
+++ b/cpp/src/arrow/tensor.cc
@@ -17,6 +17,7 @@

 #include "arrow/tensor.h"

+#include
 #include
 #include
 #include
@@ -122,50 +123,4 @@ Type::type Tensor::type_id() const { return type_->id(); }

 bool Tensor::Equals(const Tensor& other) const { return TensorEquals(*this, other); }

-// ----------------------------------------------------------------------
-// NumericTensor
-
-template <typename TYPE>
-NumericTensor<TYPE>::NumericTensor(const std::shared_ptr<Buffer>& data,
-                                   const std::vector<int64_t>& shape)
-    : NumericTensor(data, shape, {}, {}) {}
-
-template <typename TYPE>
-NumericTensor<TYPE>::NumericTensor(const std::shared_ptr<Buffer>& data,
-                                   const std::vector<int64_t>& shape,
-                                   const std::vector<int64_t>& strides)
-    : NumericTensor(data, shape, strides, {}) {}
-
-template <typename TYPE>
-NumericTensor<TYPE>::NumericTensor(const std::shared_ptr<Buffer>& data,
-                                   const std::vector<int64_t>& shape,
-                                   const std::vector<int64_t>& strides,
-                                   const std::vector<std::string>& dim_names)
-    : Tensor(TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names) {}
-
-template <typename TYPE>
-int64_t NumericTensor<TYPE>::CalculateValueOffset(
-    const std::vector<int64_t>& index) const {
-  int64_t offset = 0;
-  for (size_t i = 0; i < index.size(); ++i) {
-    offset += index[i] * strides_[i];
-  }
-  return offset;
-}
-
-// ----------------------------------------------------------------------
-// Instantiate templates
-
-template class ARROW_TEMPLATE_EXPORT NumericTensor<Int8Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<UInt8Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<Int16Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<UInt16Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<Int32Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<UInt32Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<Int64Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<UInt64Type>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<HalfFloatType>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<FloatType>;
-template class ARROW_TEMPLATE_EXPORT NumericTensor<DoubleType>;
-
 }  // namespace arrow
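The three layouts exercised by these tests differ only in their stride vectors; a small sketch of the arithmetic, mirroring CalculateValueOffset in the tensor.h hunk below:

#include <cstdint>
#include <vector>

// Row-major strides for shape {3, 4} with 8-byte elements: {4 * 8, 8}.
// Column-major: {8, 3 * 8}. The non-contiguous tests pad each row to six
// elements: {6 * 8, 8}. One formula resolves an index in all three layouts.
int64_t ByteOffset(const std::vector<int64_t>& strides,
                   const std::vector<int64_t>& index) {
  int64_t offset = 0;
  for (size_t i = 0; i < index.size(); ++i) {
    offset += index[i] * strides[i];
  }
  return offset;
}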
diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h
index a9b5df81fa193..445a81f2cf24c 100644
--- a/cpp/src/arrow/tensor.h
+++ b/cpp/src/arrow/tensor.h
@@ -25,6 +25,7 @@

 #include "arrow/buffer.h"
 #include "arrow/type.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"

@@ -50,6 +51,9 @@ static inline bool is_tensor_supported(Type::type type_id) {
   return false;
 }

+template <typename SparseIndexType>
+class SparseTensorImpl;
+
 class ARROW_EXPORT Tensor {
  public:
   virtual ~Tensor() = default;
@@ -110,27 +114,33 @@ class ARROW_EXPORT Tensor {
   /// These names are optional
   std::vector<std::string> dim_names_;

+  template <typename SparseIndexType>
+  friend class SparseTensorImpl;
+
  private:
   ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor);
 };

 template <typename TYPE>
-class ARROW_EXPORT NumericTensor : public Tensor {
+class NumericTensor : public Tensor {
  public:
   using TypeClass = TYPE;
   using value_type = typename TypeClass::c_type;

+  /// Constructor with non-negative strides and dimension names
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+                const std::vector<int64_t>& strides,
+                const std::vector<std::string>& dim_names)
+      : Tensor(TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names) {}
+
   /// Constructor with no dimension names or strides, data assumed to be row-major
-  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape);
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape)
+      : NumericTensor(data, shape, {}, {}) {}

   /// Constructor with non-negative strides
   NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
-                const std::vector<int64_t>& strides);
-
-  /// Constructor with non-negative strides and dimension names
-  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
-                const std::vector<int64_t>& strides,
-                const std::vector<std::string>& dim_names);
+                const std::vector<int64_t>& strides)
+      : NumericTensor(data, shape, strides, {}) {}

   const value_type& Value(const std::vector<int64_t>& index) const {
     int64_t offset = CalculateValueOffset(index);
@@ -139,7 +149,13 @@ class ARROW_EXPORT NumericTensor : public Tensor {
   }

  protected:
-  int64_t CalculateValueOffset(const std::vector<int64_t>& index) const;
+  int64_t CalculateValueOffset(const std::vector<int64_t>& index) const {
+    int64_t offset = 0;
+    for (size_t i = 0; i < index.size(); ++i) {
+      offset += index[i] * strides_[i];
+    }
+    return offset;
+  }
 };

 }  // namespace arrow
diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc
index 7fb96cda7af73..617c53978f619 100644
--- a/cpp/src/arrow/test-util.cc
+++ b/cpp/src/arrow/test-util.cc
@@ -18,13 +18,12 @@
 #include "arrow/test-util.h"

 #ifndef _WIN32
-#include
-#include
-#include
+#include  // IWYU pragma: keep
+#include  // IWYU pragma: keep
+#include  // IWYU pragma: keep
 #endif

 #include
-#include
 #include
 #include
 #include
@@ -33,31 +32,28 @@
 #include
 #include
 #include
-#include
 #include
 #include

 #include "arrow/array.h"
 #include "arrow/buffer.h"
-#include "arrow/builder.h"
-#include "arrow/memory_pool.h"
+#include "arrow/ipc/json-simple.h"
 #include "arrow/pretty_print.h"
 #include "arrow/status.h"
 #include "arrow/table.h"
 #include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit-util.h"
-#include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"

-void sleep_for(double seconds) {
-  std::this_thread::sleep_for(
-      std::chrono::nanoseconds(static_cast<int64_t>(seconds * 1e9)));
-}
-
 namespace arrow {

+std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>& type,
+                                     const std::string& json) {
+  std::shared_ptr<Array> out;
+  ABORT_NOT_OK(ipc::internal::json::ArrayFromJSON(type, json, &out));
+  return out;
+}
+
 void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
   const int random_seed = 0;
   std::default_random_engine gen(random_seed);
@@ -307,17 +303,23 @@ void AssertZeroPadded(const Array& array) {
   for (const auto& buffer : array.data()->buffers) {
     if (buffer) {
       const int64_t padding = buffer->capacity() - buffer->size();
-      std::vector<uint8_t> zeros(padding);
-      ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding));
+      if (padding > 0) {
+        std::vector<uint8_t> zeros(padding);
+        ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding));
+      }
     }
   }
 }

 void TestInitialized(const Array& array) {
   for (const auto& buffer : array.data()->buffers) {
-    if (buffer) {
-      std::vector<uint8_t> zeros(buffer->capacity());
-      throw_away = memcmp(buffer->data(), zeros.data(), buffer->size());
+    if (buffer && buffer->capacity() > 0) {
+      int total = 0;
+      auto data = buffer->data();
+      for (int64_t i = 0; i < buffer->size(); ++i) {
+        total ^= data[i];
+      }
+      throw_away = total;
     }
   }
 }
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index a01fd7d84a601..713ff38ca5283 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -17,23 +17,17 @@

 #pragma once

-#ifndef _WIN32
-#include
-#include
-#include
-#endif
-
 #include
-#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
-#include
+#include

 #include
 #include
@@ -43,44 +37,42 @@
 #include "arrow/builder.h"
 #include "arrow/memory_pool.h"
 #include "arrow/pretty_print.h"
+#include "arrow/record_batch.h"
 #include "arrow/status.h"
-#include "arrow/table.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
-#include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"

-#define STRINGIFY(x) #x
-
-#define ASSERT_RAISES(ENUM, expr)                                         \
-  do {                                                                    \
-    ::arrow::Status s = (expr);                                           \
-    if (!s.Is##ENUM()) {                                                  \
-      FAIL() << "Expected '" STRINGIFY(expr) "' to fail with " STRINGIFY( \
-                    ENUM) ", but got "                                    \
-             << s.ToString();                                             \
-    }                                                                     \
+#define ASSERT_RAISES(ENUM, expr)                                                     \
+  do {                                                                                \
+    ::arrow::Status s = (expr);                                                       \
+    if (!s.Is##ENUM()) {                                                              \
+      FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \
+                    ENUM) ", but got "                                                \
+             << s.ToString();                                                         \
+    }                                                                                 \
   } while (false)

-#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr)                   \
-  do {                                                                    \
-    ::arrow::Status s = (expr);                                           \
-    if (!s.Is##ENUM()) {                                                  \
-      FAIL() << "Expected '" STRINGIFY(expr) "' to fail with " STRINGIFY( \
-                    ENUM) ", but got "                                    \
-             << s.ToString();                                             \
-    }                                                                     \
-    ASSERT_EQ((message), s.ToString());                                   \
+#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr)                               \
+  do {                                                                                \
+    ::arrow::Status s = (expr);                                                       \
+    if (!s.Is##ENUM()) {                                                              \
+      FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \
+                    ENUM) ", but got "                                                \
+             << s.ToString();                                                         \
+    }                                                                                 \
+    ASSERT_EQ((message), s.ToString());                                               \
   } while (false)

-#define ASSERT_OK(expr)                                                  \
-  do {                                                                   \
-    ::arrow::Status s = (expr);                                          \
-    if (!s.ok()) {                                                       \
-      FAIL() << "'" STRINGIFY(expr) "' failed with " << s.ToString();    \
-    }                                                                    \
+#define ASSERT_OK(expr)                                                    \
+  do {                                                                     \
+    ::arrow::Status _s = (expr);                                           \
+    if (!_s.ok()) {                                                        \
+      FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _s.ToString(); \
+    }                                                                      \
  } while (false)

 #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr))
@@ -102,6 +94,10 @@

 namespace arrow {

+class ChunkedArray;
+class Column;
+class Table;
+
 using ArrayVector = std::vector<std::shared_ptr<Array>>;

 #define ASSERT_ARRAYS_EQUAL(LEFT, RIGHT) \
@@ -169,6 +165,12 @@ static inline Status GetBitmapFromVector(const std::vector<T>& is_valid,
   return Status::OK();
 }

+template <typename T>
+inline void BitmapFromVector(const std::vector<T>& is_valid,
+                             std::shared_ptr<Buffer>* out) {
+  ASSERT_OK(GetBitmapFromVector(is_valid, out));
+}
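Both helpers added above shorten test fixtures considerably; a sketch of typical use (test-only, since a malformed JSON literal aborts the process via ABORT_NOT_OK, and the gtest assertion macros are assumed to be in scope):

#include <memory>
#include <vector>
#include "arrow/array.h"
#include "arrow/test-util.h"
#include "arrow/type.h"

// Sketch: build test arrays from a JSON literal instead of builder calls.
void MakeFixtures() {
  std::shared_ptr<arrow::Array> ints =
      arrow::ArrayFromJSON(arrow::int32(), "[1, 2, null, 4]");
  // Build a validity bitmap directly from a vector of flags.
  std::shared_ptr<arrow::Buffer> bitmap;
  arrow::BitmapFromVector(std::vector<bool>({true, true, false, true}), &bitmap);
  (void)ints;
}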
// Sets approximately pct_null of the first n bytes in null_bytes to zero
// and the rest to non-zero (true) values.
 ARROW_EXPORT void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes);
@@ -200,6 +202,15 @@ ARROW_EXPORT void PrintColumn(const Column& col, std::stringstream* ss);
 ARROW_EXPORT void AssertTablesEqual(const Table& expected, const Table& actual,
                                     bool same_chunk_layout = true);

+template <typename C_TYPE>
+void AssertNumericDataEqual(const C_TYPE* raw_data,
+                            const std::vector<C_TYPE>& expected_values) {
+  for (auto expected : expected_values) {
+    ASSERT_EQ(expected, *raw_data);
+    ++raw_data;
+  }
+}
+
 ARROW_EXPORT void CompareBatch(const RecordBatch& left, const RecordBatch& right);

 // Check if the padding of the buffers of the array is zero.
@@ -247,6 +258,12 @@ Status MakeRandomBuffer(int64_t length, MemoryPool* pool,
   return Status::OK();
 }

+// ArrayFromJSON: construct an Array from a simple JSON representation
+
+ARROW_EXPORT
+std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>&,
+                                     const std::string& json);
+
 // ArrayFromVector: construct an Array from vectors of C values

 template
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index e0a10690c2c77..957c7632149f8 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -24,6 +24,8 @@

 #include

+#include "arrow/memory_pool.h"
+#include "arrow/test-util.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"

@@ -56,6 +58,7 @@ TEST(TestField, Equals) {
   ASSERT_TRUE(f0.Equals(f0_other));
   ASSERT_FALSE(f0.Equals(f0_nn));
   ASSERT_FALSE(f0.Equals(f0_with_meta));
+  ASSERT_TRUE(f0.Equals(f0_with_meta, false));
 }

 TEST(TestField, TestMetadataConstruction) {
@@ -198,28 +201,31 @@ TEST_F(TestSchema, GetFieldIndex) {
 }

 TEST_F(TestSchema, TestMetadataConstruction) {
-  auto f0 = field("f0", int32());
-  auto f1 = field("f1", uint8(), false);
-  auto f2 = field("f2", utf8());
   auto metadata0 = key_value_metadata({{"foo", "bar"}, {"bizz", "buzz"}});
   auto metadata1 = key_value_metadata({{"foo", "baz"}});
-  auto schema0 = ::arrow::schema({f0, f1, f2}, metadata0);
-  ASSERT_TRUE(metadata0->Equals(*schema0->metadata()));
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8(), false);
+  auto f2 = field("f2", utf8(), true);
+  auto f3 = field("f2", utf8(), true, metadata1->Copy());
+  auto schema0 = ::arrow::schema({f0, f1, f2}, metadata0);
   auto schema1 = ::arrow::schema({f0, f1, f2}, metadata1);
-  ASSERT_TRUE(metadata1->Equals(*schema1->metadata()));
   auto schema2 = ::arrow::schema({f0, f1, f2}, metadata0->Copy());
-  ASSERT_TRUE(metadata0->Equals(*schema2->metadata()));
+  auto schema3 = ::arrow::schema({f0, f1, f3}, metadata0->Copy());
+  ASSERT_TRUE(metadata0->Equals(*schema0->metadata()));
+  ASSERT_TRUE(metadata1->Equals(*schema1->metadata()));
+  ASSERT_TRUE(metadata0->Equals(*schema2->metadata()));
   ASSERT_TRUE(schema0->Equals(*schema2));
   ASSERT_FALSE(schema0->Equals(*schema1));
   ASSERT_FALSE(schema2->Equals(*schema1));
+  ASSERT_FALSE(schema2->Equals(*schema3));

   // don't check metadata
   ASSERT_TRUE(schema0->Equals(*schema1, false));
   ASSERT_TRUE(schema2->Equals(*schema1, false));
+  ASSERT_TRUE(schema2->Equals(*schema3, false));
 }

 TEST_F(TestSchema, TestAddMetadata) {
@@ -256,19 +262,19 @@ TEST_F(TestSchema, TestRemoveMetadata) {
     ASSERT_EQ(tp.ToString(), std::string(NAME)); \
   }

-PRIMITIVE_TEST(Int8Type, INT8, "int8");
-PRIMITIVE_TEST(Int16Type, INT16, "int16");
-PRIMITIVE_TEST(Int32Type, INT32, "int32");
-PRIMITIVE_TEST(Int64Type, INT64, "int64");
-PRIMITIVE_TEST(UInt8Type, UINT8, "uint8");
-PRIMITIVE_TEST(UInt16Type, UINT16, "uint16");
-PRIMITIVE_TEST(UInt32Type, UINT32, "uint32");
-PRIMITIVE_TEST(UInt64Type, UINT64, "uint64");
"uint64"); +PRIMITIVE_TEST(Int8Type, INT8, "int8") +PRIMITIVE_TEST(Int16Type, INT16, "int16") +PRIMITIVE_TEST(Int32Type, INT32, "int32") +PRIMITIVE_TEST(Int64Type, INT64, "int64") +PRIMITIVE_TEST(UInt8Type, UINT8, "uint8") +PRIMITIVE_TEST(UInt16Type, UINT16, "uint16") +PRIMITIVE_TEST(UInt32Type, UINT32, "uint32") +PRIMITIVE_TEST(UInt64Type, UINT64, "uint64") -PRIMITIVE_TEST(FloatType, FLOAT, "float"); -PRIMITIVE_TEST(DoubleType, DOUBLE, "double"); +PRIMITIVE_TEST(FloatType, FLOAT, "float") +PRIMITIVE_TEST(DoubleType, DOUBLE, "double") -PRIMITIVE_TEST(BooleanType, BOOL, "bool"); +PRIMITIVE_TEST(BooleanType, BOOL, "bool") TEST(TestBinaryType, ToString) { BinaryType t1; @@ -446,7 +452,7 @@ TEST(TestStructType, Basics) { // TODO(wesm): out of bounds for field(...) } -TEST(TestStructType, GetChildByName) { +TEST(TestStructType, GetFieldByName) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); @@ -455,17 +461,17 @@ TEST(TestStructType, GetChildByName) { StructType struct_type({f0, f1, f2, f3}); std::shared_ptr result; - result = struct_type.GetChildByName("f1"); + result = struct_type.GetFieldByName("f1"); ASSERT_EQ(f1, result); - result = struct_type.GetChildByName("f3"); + result = struct_type.GetFieldByName("f3"); ASSERT_EQ(f3, result); - result = struct_type.GetChildByName("not-found"); + result = struct_type.GetFieldByName("not-found"); ASSERT_EQ(result, nullptr); } -TEST(TestStructType, GetChildIndex) { +TEST(TestStructType, GetFieldIndex) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); @@ -473,11 +479,147 @@ TEST(TestStructType, GetChildIndex) { StructType struct_type({f0, f1, f2, f3}); - ASSERT_EQ(0, struct_type.GetChildIndex(f0->name())); - ASSERT_EQ(1, struct_type.GetChildIndex(f1->name())); - ASSERT_EQ(2, struct_type.GetChildIndex(f2->name())); - ASSERT_EQ(3, struct_type.GetChildIndex(f3->name())); - ASSERT_EQ(-1, struct_type.GetChildIndex("not-found")); + ASSERT_EQ(0, struct_type.GetFieldIndex(f0->name())); + ASSERT_EQ(1, struct_type.GetFieldIndex(f1->name())); + ASSERT_EQ(2, struct_type.GetFieldIndex(f2->name())); + ASSERT_EQ(3, struct_type.GetFieldIndex(f3->name())); + ASSERT_EQ(-1, struct_type.GetFieldIndex("not-found")); +} + +TEST(TestStructType, GetFieldIndexDuplicates) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int64()); + auto f2 = field("f1", utf8()); + StructType struct_type({f0, f1, f2}); + + ASSERT_EQ(0, struct_type.GetFieldIndex("f0")); + ASSERT_EQ(-1, struct_type.GetFieldIndex("f1")); +} + +TEST(TestDictionaryType, Equals) { + auto t1 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t3 = dictionary(int16(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t4 = dictionary(int8(), ArrayFromJSON(int16(), "[3, 4, 5, 6]")); + auto t5 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 7, 6]")); + + ASSERT_TRUE(t1->Equals(t2)); + // Different index type + ASSERT_FALSE(t1->Equals(t3)); + // Different value type + ASSERT_FALSE(t1->Equals(t4)); + // Different values + ASSERT_FALSE(t1->Equals(t5)); +} + +TEST(TestDictionaryType, UnifyNumeric) { + auto t1 = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int64(), "[1, 7, 4, 8]")); + auto t3 = dictionary(int8(), ArrayFromJSON(int64(), "[1, -200]")); + + auto expected = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7, 1, 8, -200]")); + + std::shared_ptr 
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()},
+                                  &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  std::vector<std::vector<int32_t>> transpose_maps;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()},
+                                  &dict_type, &transpose_maps));
+  ASSERT_TRUE(dict_type->Equals(expected));
+  ASSERT_EQ(transpose_maps.size(), 3);
+  ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1, 2}));
+  ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({3, 2, 1, 4}));
+  ASSERT_EQ(transpose_maps[2], std::vector<int32_t>({3, 5}));
+}
+
+TEST(TestDictionaryType, UnifyString) {
+  auto t1 = dictionary(int16(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\"]"));
+  auto t2 = dictionary(int32(), ArrayFromJSON(utf8(), "[\"quux\", \"foo\"]"));
+
+  auto expected =
+      dictionary(int8(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"quux\"]"));
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(
+      DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  std::vector<std::vector<int32_t>> transpose_maps;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type,
+                                  &transpose_maps));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  ASSERT_EQ(transpose_maps.size(), 2);
+  ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1}));
+  ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({2, 0}));
+}
+
+TEST(TestDictionaryType, UnifyFixedSizeBinary) {
+  auto type = fixed_size_binary(3);
+
+  std::string data = "foobarbazqux";
+  auto buf = std::make_shared<Buffer>(data);
+  // ["foo", "bar"]
+  auto dict1 = std::make_shared<FixedSizeBinaryArray>(type, 2, SliceBuffer(buf, 0, 6));
+  auto t1 = dictionary(int16(), dict1);
+  // ["bar", "baz", "qux"]
+  auto dict2 = std::make_shared<FixedSizeBinaryArray>(type, 3, SliceBuffer(buf, 3, 9));
+  auto t2 = dictionary(int16(), dict2);
+
+  // ["foo", "bar", "baz", "qux"]
+  auto expected_dict = std::make_shared<FixedSizeBinaryArray>(type, 4, buf);
+  auto expected = dictionary(int8(), expected_dict);
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(
+      DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  std::vector<std::vector<int32_t>> transpose_maps;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type,
+                                  &transpose_maps));
+  ASSERT_TRUE(dict_type->Equals(expected));
+  ASSERT_EQ(transpose_maps.size(), 2);
+  ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1}));
+  ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({1, 2, 3}));
+}
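The transpose maps asserted in these tests translate an array's old dictionary indices into positions in the unified dictionary; a sketch of how such a map would be applied (plain vectors stand in for Arrow index arrays). Using the UnifyNumeric data above, the dictionary [1, 7, 4, 8] gets the transpose map [3, 2, 1, 4] against the unified dictionary [3, 4, 7, 1, 8, -200]:

#include <cstdint>
#include <vector>

// Sketch: remap old dictionary indices through a transpose map produced by
// DictionaryType::Unify, yielding indices into the unified dictionary.
std::vector<int32_t> TransposeIndices(const std::vector<int32_t>& old_indices,
                                      const std::vector<int32_t>& transpose_map) {
  std::vector<int32_t> out;
  out.reserve(old_indices.size());
  for (int32_t i : old_indices) {
    out.push_back(transpose_map[i]);  // old index -> unified index
  }
  return out;
}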
+TEST(TestDictionaryType, UnifyLarge) {
+  // Unifying "large" dictionary types should choose the right index type
+  std::shared_ptr<Array> dict1, dict2, expected_dict;
+
+  Int32Builder builder;
+  ASSERT_OK(builder.Reserve(120));
+  for (int32_t i = 0; i < 120; ++i) {
+    builder.UnsafeAppend(i);
+  }
+  ASSERT_OK(builder.Finish(&dict1));
+  ASSERT_EQ(dict1->length(), 120);
+  auto t1 = dictionary(int8(), dict1);
+
+  ASSERT_OK(builder.Reserve(30));
+  for (int32_t i = 110; i < 140; ++i) {
+    builder.UnsafeAppend(i);
+  }
+  ASSERT_OK(builder.Finish(&dict2));
+  ASSERT_EQ(dict2->length(), 30);
+  auto t2 = dictionary(int8(), dict2);
+
+  ASSERT_OK(builder.Reserve(140));
+  for (int32_t i = 0; i < 140; ++i) {
+    builder.UnsafeAppend(i);
+  }
+  ASSERT_OK(builder.Finish(&expected_dict));
+  ASSERT_EQ(expected_dict->length(), 140);
+  // int8 would be too narrow to hold all possible index values
+  auto expected = dictionary(int16(), expected_dict);
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(
+      DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+}

 TEST(TypesTest, TestDecimal128Small) {
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 5f1ca8d7b0f09..15f353d4d1f6a 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -65,13 +65,15 @@ std::vector<std::shared_ptr<Field>> Field::Flatten() const {
   return flattened;
 }

-bool Field::Equals(const Field& other) const {
+bool Field::Equals(const Field& other, bool check_metadata) const {
   if (this == &other) {
     return true;
   }
   if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
       this->type_->Equals(*other.type_.get())) {
-    if (this->HasMetadata() && other.HasMetadata()) {
+    if (!check_metadata) {
+      return true;
+    } else if (this->HasMetadata() && other.HasMetadata()) {
       return metadata_->Equals(*other.metadata_);
     } else if (!this->HasMetadata() && !other.HasMetadata()) {
       return true;
@@ -82,8 +84,8 @@
   return false;
 }

-bool Field::Equals(const std::shared_ptr<Field>& other) const {
-  return Equals(*other.get());
+bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const {
+  return Equals(*other.get(), check_metadata);
 }

 std::string Field::ToString() const {
@@ -135,12 +137,11 @@ std::string FixedSizeBinaryType::ToString() const {
 // ----------------------------------------------------------------------
 // Date types

-DateType::DateType(Type::type type_id, DateUnit unit)
-    : FixedWidthType(type_id), unit_(unit) {}
+DateType::DateType(Type::type type_id) : FixedWidthType(type_id) {}

-Date32Type::Date32Type() : DateType(Type::DATE32, DateUnit::DAY) {}
+Date32Type::Date32Type() : DateType(Type::DATE32) {}

-Date64Type::Date64Type() : DateType(Type::DATE64, DateUnit::MILLI) {}
+Date64Type::Date64Type() : DateType(Type::DATE64) {}

 std::string Date64Type::ToString() const { return std::string("date64[ms]"); }

@@ -218,6 +219,24 @@ std::string UnionType::ToString() const {
 // ----------------------------------------------------------------------
 // Struct type

+namespace {
+
+std::unordered_map<std::string, int> CreateNameToIndexMap(
+    const std::vector<std::shared_ptr<Field>>& fields) {
+  std::unordered_map<std::string, int> name_to_index;
+  for (size_t i = 0; i < fields.size(); ++i) {
+    name_to_index[fields[i]->name()] = static_cast<int>(i);
+  }
+  return name_to_index;
+}
+
+}  // namespace
+
+StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields)
+    : NestedType(Type::STRUCT), name_to_index_(CreateNameToIndexMap(fields)) {
+  children_ = fields;
+}
+
 std::string StructType::ToString() const {
   std::stringstream s;
   s << "struct<";
@@ -232,15 +251,28 @@
   return s.str();
 }

-std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
-  int i = GetChildIndex(name);
+std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
+  int i = GetFieldIndex(name);
   return i == -1 ? nullptr : children_[i];
 }

-int StructType::GetChildIndex(const std::string& name) const {
-  if (children_.size() > 0 && name_to_index_.size() == 0) {
+int StructType::GetFieldIndex(const std::string& name) const {
+  if (name_to_index_.size() < children_.size()) {
+    // There are duplicate field names. Refuse to guess
+    int counts = 0;
+    int last_observed_index = -1;
     for (size_t i = 0; i < children_.size(); ++i) {
-      name_to_index_[children_[i]->name()] = static_cast<int>(i);
+      if (children_[i]->name() == name) {
+        ++counts;
+        last_observed_index = static_cast<int>(i);
+      }
+    }
+
+    if (counts == 1) {
+      return last_observed_index;
+    } else {
+      // Duplicate or not found
+      return -1;
     }
   }

@@ -252,6 +284,14 @@ int StructType::GetChildIndex(const std::string& name) const {
   }
 }

+std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
+  return GetFieldByName(name);
+}
+
+int StructType::GetChildIndex(const std::string& name) const {
+  return GetFieldIndex(name);
+}
+
 // ----------------------------------------------------------------------
 // DictionaryType

@@ -260,7 +300,12 @@ DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
     : FixedWidthType(Type::DICTIONARY),
       index_type_(index_type),
       dictionary_(dictionary),
-      ordered_(ordered) {}
+      ordered_(ordered) {
+#ifndef NDEBUG
+  const auto& int_type = checked_cast<const IntegerType&>(*index_type);
+  DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed";
+#endif
+}

 int DictionaryType::bit_width() const {
   return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
@@ -285,11 +330,15 @@ std::string NullType::ToString() const { return name(); }

 Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields,
                const std::shared_ptr<const KeyValueMetadata>& metadata)
-    : fields_(fields), metadata_(metadata) {}
+    : fields_(fields),
+      name_to_index_(CreateNameToIndexMap(fields_)),
+      metadata_(metadata) {}

 Schema::Schema(std::vector<std::shared_ptr<Field>>&& fields,
                const std::shared_ptr<const KeyValueMetadata>& metadata)
-    : fields_(std::move(fields)), metadata_(metadata) {}
+    : fields_(std::move(fields)),
+      name_to_index_(CreateNameToIndexMap(fields_)),
+      metadata_(metadata) {}

 bool Schema::Equals(const Schema& other, bool check_metadata) const {
   if (this == &other) {
@@ -301,7 +350,7 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const {
     return false;
   }
   for (int i = 0; i < num_fields(); ++i) {
-    if (!field(i)->Equals(*other.field(i).get())) {
+    if (!field(i)->Equals(*other.field(i).get(), check_metadata)) {
       return false;
     }
   }
@@ -324,12 +373,6 @@ std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const {
 }

 int64_t Schema::GetFieldIndex(const std::string& name) const {
-  if (fields_.size() > 0 && name_to_index_.size() == 0) {
-    for (size_t i = 0; i < fields_.size(); ++i) {
-      name_to_index_[fields_[i]->name()] = static_cast<int>(i);
-    }
-  }
-
   auto it = name_to_index_.find(name);
   if (it == name_to_index_.end()) {
     return -1;
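A sketch tying together the two behavior changes in this file: metadata-insensitive equality via the new check_metadata flag, and GetFieldIndex refusing to pick among duplicate names (the field names and metadata values are illustrative):

#include <memory>
#include "arrow/type.h"
#include "arrow/util/key_value_metadata.h"

void EqualityAndDuplicates() {
  auto md = arrow::key_value_metadata({{"origin", "sensor"}});
  auto plain = arrow::field("f0", arrow::int32());
  auto tagged = plain->AddMetadata(md);

  bool strict = plain->Equals(*tagged);        // false: metadata differs
  bool loose = plain->Equals(*tagged, false);  // true: metadata ignored

  // With duplicate field names, GetFieldIndex refuses to guess.
  arrow::StructType st({arrow::field("a", arrow::int32()),
                        arrow::field("a", arrow::utf8())});
  int idx = st.GetFieldIndex("a");  // -1: ambiguous
  (void)strict;
  (void)loose;
  (void)idx;
}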
@@ -419,22 +462,22 @@ std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>>&& fields,
 #define ACCEPT_VISITOR(TYPE) \
   Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); }

-ACCEPT_VISITOR(NullType);
-ACCEPT_VISITOR(BooleanType);
-ACCEPT_VISITOR(BinaryType);
-ACCEPT_VISITOR(FixedSizeBinaryType);
-ACCEPT_VISITOR(StringType);
-ACCEPT_VISITOR(ListType);
-ACCEPT_VISITOR(StructType);
-ACCEPT_VISITOR(Decimal128Type);
-ACCEPT_VISITOR(UnionType);
-ACCEPT_VISITOR(Date32Type);
-ACCEPT_VISITOR(Date64Type);
-ACCEPT_VISITOR(Time32Type);
-ACCEPT_VISITOR(Time64Type);
-ACCEPT_VISITOR(TimestampType);
-ACCEPT_VISITOR(IntervalType);
-ACCEPT_VISITOR(DictionaryType);
+ACCEPT_VISITOR(NullType)
+ACCEPT_VISITOR(BooleanType)
+ACCEPT_VISITOR(BinaryType)
+ACCEPT_VISITOR(FixedSizeBinaryType)
+ACCEPT_VISITOR(StringType)
+ACCEPT_VISITOR(ListType)
+ACCEPT_VISITOR(StructType)
+ACCEPT_VISITOR(Decimal128Type)
+ACCEPT_VISITOR(UnionType)
+ACCEPT_VISITOR(Date32Type)
+ACCEPT_VISITOR(Date64Type)
+ACCEPT_VISITOR(Time32Type)
+ACCEPT_VISITOR(Time64Type)
+ACCEPT_VISITOR(TimestampType)
+ACCEPT_VISITOR(IntervalType)
+ACCEPT_VISITOR(DictionaryType)

 #define TYPE_FACTORY(NAME, KLASS)                                        \
   std::shared_ptr<DataType> NAME() {                                     \
@@ -442,23 +485,23 @@
     return result;                                                       \
   }

-TYPE_FACTORY(null, NullType);
-TYPE_FACTORY(boolean, BooleanType);
-TYPE_FACTORY(int8, Int8Type);
-TYPE_FACTORY(uint8, UInt8Type);
-TYPE_FACTORY(int16, Int16Type);
-TYPE_FACTORY(uint16, UInt16Type);
-TYPE_FACTORY(int32, Int32Type);
-TYPE_FACTORY(uint32, UInt32Type);
-TYPE_FACTORY(int64, Int64Type);
-TYPE_FACTORY(uint64, UInt64Type);
-TYPE_FACTORY(float16, HalfFloatType);
-TYPE_FACTORY(float32, FloatType);
-TYPE_FACTORY(float64, DoubleType);
-TYPE_FACTORY(utf8, StringType);
-TYPE_FACTORY(binary, BinaryType);
-TYPE_FACTORY(date64, Date64Type);
-TYPE_FACTORY(date32, Date32Type);
+TYPE_FACTORY(null, NullType)
+TYPE_FACTORY(boolean, BooleanType)
+TYPE_FACTORY(int8, Int8Type)
+TYPE_FACTORY(uint8, UInt8Type)
+TYPE_FACTORY(int16, Int16Type)
+TYPE_FACTORY(uint16, UInt16Type)
+TYPE_FACTORY(int32, Int32Type)
+TYPE_FACTORY(uint32, UInt32Type)
+TYPE_FACTORY(int64, Int64Type)
+TYPE_FACTORY(uint64, UInt64Type)
+TYPE_FACTORY(float16, HalfFloatType)
+TYPE_FACTORY(float32, FloatType)
+TYPE_FACTORY(float64, DoubleType)
+TYPE_FACTORY(utf8, StringType)
+TYPE_FACTORY(binary, BinaryType)
+TYPE_FACTORY(date64, Date64Type)
+TYPE_FACTORY(date32, Date32Type)

 std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) {
   return std::make_shared<FixedSizeBinaryType>(byte_width);
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 63f0e2d237242..752fc85fb9504 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -39,12 +39,13 @@ namespace arrow {

 class Array;
 class Field;
+class MemoryPool;

-/// \brief Main data type enumeration
-///
-/// This enumeration provides a quick way to interrogate the category
-/// of a DataType instance.
 struct Type {
+  /// \brief Main data type enumeration
+  ///
+  /// This enumeration provides a quick way to interrogate the category
+  /// of a DataType instance.
   enum type {
     /// A NULL type having no physical storage
     NA,
@@ -143,18 +144,19 @@ struct Type {
/// nested type consisting of other data types, or another data type (e.g. a
/// timestamp encoded as an int64).
///
-/// Simple datatypes may be entirely described by their Type id, but
+/// Simple datatypes may be entirely described by their Type::type id, but
/// complex datatypes are usually parametric.
 class ARROW_EXPORT DataType {
  public:
  explicit DataType(Type::type id) : id_(id) {}
   virtual ~DataType();

-  // Return whether the types are equal
-  //
-  // Types that are logically convertible from one to another (e.g. List
-  // and Binary) are NOT equal.
+  /// \brief Return whether the types are equal
+  ///
+  /// Types that are logically convertible from one to another (e.g. List
+  /// and Binary) are NOT equal.
  virtual bool Equals(const DataType& other) const;
+  /// \brief Return whether the types are equal
   bool Equals(const std::shared_ptr<DataType>& other) const;

   std::shared_ptr<Field> child(int i) const { return children_[i]; }
@@ -174,6 +176,7 @@ class ARROW_EXPORT DataType {
   /// \since 0.7.0
   virtual std::string name() const = 0;

+  /// \brief Return the type category
   Type::type id() const { return id_; }

  protected:
@@ -248,23 +251,31 @@ class ARROW_EXPORT Field {
         const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR)
       : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}

+  /// \brief Return the field's attached metadata
   std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }

+  /// \brief Return whether the field has non-empty metadata
   bool HasMetadata() const;

+  /// \brief Return a copy of this field with the given metadata attached to it
   std::shared_ptr<Field> AddMetadata(
       const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+  /// \brief Return a copy of this field without any metadata attached to it
   std::shared_ptr<Field> RemoveMetadata() const;

   std::vector<std::shared_ptr<Field>> Flatten() const;

-  bool Equals(const Field& other) const;
-  bool Equals(const std::shared_ptr<Field>& other) const;
+  bool Equals(const Field& other, bool check_metadata = true) const;
+  bool Equals(const std::shared_ptr<Field>& other, bool check_metadata = true) const;

+  /// \brief Return a string representation of the field
   std::string ToString() const;

+  /// \brief Return the field name
   const std::string& name() const { return name_; }
+  /// \brief Return the field data type
   std::shared_ptr<DataType> type() const { return type_; }
+  /// \brief Return whether the field is nullable
   bool nullable() const { return nullable_; }

 private:
@@ -495,24 +506,27 @@ class ARROW_EXPORT StructType : public NestedType {
 public:
   static constexpr Type::type type_id = Type::STRUCT;

-  explicit StructType(const std::vector<std::shared_ptr<Field>>& fields)
-      : NestedType(Type::STRUCT) {
-    children_ = fields;
-  }
+  explicit StructType(const std::vector<std::shared_ptr<Field>>& fields);

   Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
   std::string name() const override { return "struct"; }

   /// Returns null if name not found
+  std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
+
+  /// Returns -1 if name not found or if there are multiple fields having the
+  /// same name
+  int GetFieldIndex(const std::string& name) const;
+
+  ARROW_DEPRECATED("Use GetFieldByName")
   std::shared_ptr<Field> GetChildByName(const std::string& name) const;

-  /// Returns -1 if name not found
+  ARROW_DEPRECATED("Use GetFieldIndex")
   int GetChildIndex(const std::string& name) const;

 private:
-  /// Lazily initialized mapping
-  mutable std::unordered_map<std::string, int> name_to_index_;
+  std::unordered_map<std::string, int> name_to_index_;
 };

 /// \brief Base type class for (fixed-size) decimal data
@@ -582,17 +596,17 @@ enum class DateUnit : char { DAY = 0, MILLI = 1 };
/// \brief Base type class for date data
 class ARROW_EXPORT DateType : public FixedWidthType {
 public:
-  DateUnit unit() const { return unit_; }
+  virtual DateUnit unit() const = 0;

 protected:
-  DateType(Type::type type_id, DateUnit unit);
-  DateUnit unit_;
+  explicit DateType(Type::type type_id);
 };

/// Concrete type class for 32-bit date data (as number of days since UNIX epoch)
 class ARROW_EXPORT Date32Type : public DateType {
 public:
   static constexpr Type::type type_id = Type::DATE32;
+  static constexpr DateUnit UNIT = DateUnit::DAY;

   using c_type = int32_t;

@@ -604,12 +618,14 @@ class ARROW_EXPORT Date32Type : public DateType {
   std::string ToString() const override;

   std::string name() const override { return
"date32"; } + DateUnit unit() const override { return UNIT; } }; /// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch) class ARROW_EXPORT Date64Type : public DateType { public: static constexpr Type::type type_id = Type::DATE64; + static constexpr DateUnit UNIT = DateUnit::MILLI; using c_type = int64_t; @@ -621,9 +637,11 @@ class ARROW_EXPORT Date64Type : public DateType { std::string ToString() const override; std::string name() const override { return "date64"; } + DateUnit unit() const override { return UNIT; } }; struct TimeUnit { + /// The unit for a time or timestamp DataType enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; }; @@ -757,6 +775,23 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { bool ordered() const { return ordered_; } + /// \brief Unify several dictionary types + /// + /// Compute a resulting dictionary that will allow the union of values + /// of all input dictionary types. The input types must all have the + /// same value type. + /// \param[in] pool Memory pool to allocate dictionary values from + /// \param[in] types A sequence of input dictionary types + /// \param[out] out_type The unified dictionary type + /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, + /// one per input type. Each integer vector represents the transposition + /// of input type indices into unified type indices. + // XXX Should we return something special (an empty transpose map?) when + // the transposition is the identity function? + static Status Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps = NULLPTR); + private: // Must be an integer type (not currently checked) std::shared_ptr index_type_; @@ -827,8 +862,7 @@ class ARROW_EXPORT Schema { private: std::vector> fields_; - /// Lazily initialized mapping - mutable std::unordered_map name_to_index_; + std::unordered_map name_to_index_; std::shared_ptr metadata_; }; @@ -837,6 +871,9 @@ class ARROW_EXPORT Schema { // Parametric factory functions // Other factory functions are in type_fwd.h +/// \addtogroup type-factories +/// @{ + /// \brief Create a FixedSizeBinaryType instance ARROW_EXPORT std::shared_ptr fixed_size_binary(int32_t byte_width); @@ -890,6 +927,13 @@ std::shared_ptr ARROW_EXPORT dictionary(const std::shared_ptr& index_type, const std::shared_ptr& values, bool ordered = false); +/// @} + +/// \defgroup schema-factories Factory functions for fields and schemas +/// +/// Factory functions for fields and schemas +/// @{ + /// \brief Create a Field instance /// /// \param name the field name @@ -920,6 +964,8 @@ std::shared_ptr schema( std::vector>&& fields, const std::shared_ptr& metadata = NULLPTR); +/// @} + } // namespace arrow #endif // ARROW_TYPE_H diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index dbbe7092b4f12..2593a4f7e9947 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -96,17 +96,17 @@ class NumericTensor; using KLASS##Builder = NumericBuilder; \ using KLASS##Tensor = NumericTensor; -_NUMERIC_TYPE_DECL(Int8); -_NUMERIC_TYPE_DECL(Int16); -_NUMERIC_TYPE_DECL(Int32); -_NUMERIC_TYPE_DECL(Int64); -_NUMERIC_TYPE_DECL(UInt8); -_NUMERIC_TYPE_DECL(UInt16); -_NUMERIC_TYPE_DECL(UInt32); -_NUMERIC_TYPE_DECL(UInt64); -_NUMERIC_TYPE_DECL(HalfFloat); -_NUMERIC_TYPE_DECL(Float); -_NUMERIC_TYPE_DECL(Double); +_NUMERIC_TYPE_DECL(Int8) +_NUMERIC_TYPE_DECL(Int16) +_NUMERIC_TYPE_DECL(Int32) +_NUMERIC_TYPE_DECL(Int64) +_NUMERIC_TYPE_DECL(UInt8) 
+_NUMERIC_TYPE_DECL(UInt16)
+_NUMERIC_TYPE_DECL(UInt32)
+_NUMERIC_TYPE_DECL(UInt64)
+_NUMERIC_TYPE_DECL(HalfFloat)
+_NUMERIC_TYPE_DECL(Float)
+_NUMERIC_TYPE_DECL(Double)

 #undef _NUMERIC_TYPE_DECL

@@ -137,6 +137,11 @@ using IntervalArray = NumericArray<IntervalType>;
// (parameter-free) Factory functions
// Other factory functions are in type.h

+/// \defgroup type-factories Factory functions for creating data types
+///
+/// Factory functions for creating data types
+/// @{
+
/// \brief Return a NullType instance
 std::shared_ptr<DataType> ARROW_EXPORT null();
/// \brief Return a BooleanType instance
@@ -172,6 +177,8 @@
 std::shared_ptr<DataType> ARROW_EXPORT date32();
/// \brief Return a Date64Type instance
 std::shared_ptr<DataType> ARROW_EXPORT date64();

+/// @}
+
 }  // namespace arrow

 #endif  // ARROW_TYPE_FWD_H
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index da5cf25f5eed1..edb8ca166f6ee 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -45,7 +45,7 @@ struct TypeTraits<UInt8Type> {
   using ArrayType = UInt8Array;
   using BuilderType = UInt8Builder;
   using TensorType = UInt8Tensor;
-  static inline int64_t bytes_required(int64_t elements) { return elements; }
+  static constexpr int64_t bytes_required(int64_t elements) { return elements; }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return uint8(); }
 };
@@ -55,7 +55,7 @@ struct TypeTraits<Int8Type> {
   using ArrayType = Int8Array;
   using BuilderType = Int8Builder;
   using TensorType = Int8Tensor;
-  static inline int64_t bytes_required(int64_t elements) { return elements; }
+  static constexpr int64_t bytes_required(int64_t elements) { return elements; }
   constexpr static bool is_parameter_free = true;
   static inline std::shared_ptr<DataType> type_singleton() { return int8(); }
 };
@@ -66,7 +66,7 @@ struct TypeTraits<UInt16Type> {
   using BuilderType = UInt16Builder;
   using TensorType = UInt16Tensor;

-  static inline int64_t bytes_required(int64_t elements) {
+  static constexpr int64_t bytes_required(int64_t elements) {
     return elements * sizeof(uint16_t);
   }
   constexpr static bool is_parameter_free = true;
@@ -79,7 +79,7 @@ struct TypeTraits<Int16Type> {
   using BuilderType = Int16Builder;
   using TensorType = Int16Tensor;

-  static inline int64_t bytes_required(int64_t elements) {
+  static constexpr int64_t bytes_required(int64_t elements) {
     return elements * sizeof(int16_t);
   }
   constexpr static bool is_parameter_free = true;
@@ -92,7 +92,7 @@ struct TypeTraits<UInt32Type> {
   using BuilderType = UInt32Builder;
   using TensorType = UInt32Tensor;

-  static inline int64_t bytes_required(int64_t elements) {
+  static constexpr int64_t bytes_required(int64_t elements) {
     return elements * sizeof(uint32_t);
   }
   constexpr static bool is_parameter_free = true;
@@ -105,7 +105,7 @@ struct TypeTraits<Int32Type> {
   using BuilderType = Int32Builder;
   using TensorType = Int32Tensor;

-  static inline int64_t bytes_required(int64_t elements) {
+  static constexpr int64_t bytes_required(int64_t elements) {
     return elements * sizeof(int32_t);
   }
   constexpr static bool is_parameter_free = true;
@@ -118,7 +118,7 @@ struct TypeTraits<UInt64Type> {
   using BuilderType = UInt64Builder;
   using TensorType = UInt64Tensor;

-  static inline int64_t bytes_required(int64_t elements) {
+  static constexpr int64_t bytes_required(int64_t elements) {
     return elements * sizeof(uint64_t);
   }
   constexpr static bool is_parameter_free = true;
@@ -131,7 +131,7 @@ struct TypeTraits<Int64Type> {
   using BuilderType = Int64Builder;
   using TensorType = Int64Tensor;

-  static inline int64_t bytes_required(int64_t elements) {
+  static constexpr int64_t bytes_required(int64_t
elements) { return elements * sizeof(int64_t); } constexpr static bool is_parameter_free = true; @@ -143,7 +143,7 @@ struct TypeTraits { using ArrayType = Date64Array; using BuilderType = Date64Builder; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return elements * sizeof(int64_t); } constexpr static bool is_parameter_free = true; @@ -155,7 +155,7 @@ struct TypeTraits { using ArrayType = Date32Array; using BuilderType = Date32Builder; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return elements * sizeof(int32_t); } constexpr static bool is_parameter_free = true; @@ -167,7 +167,7 @@ struct TypeTraits { using ArrayType = TimestampArray; using BuilderType = TimestampBuilder; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return elements * sizeof(int64_t); } constexpr static bool is_parameter_free = false; @@ -178,7 +178,7 @@ struct TypeTraits { using ArrayType = Time32Array; using BuilderType = Time32Builder; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return elements * sizeof(int32_t); } constexpr static bool is_parameter_free = false; @@ -189,7 +189,7 @@ struct TypeTraits { using ArrayType = Time64Array; using BuilderType = Time64Builder; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return elements * sizeof(int64_t); } constexpr static bool is_parameter_free = false; @@ -201,7 +201,7 @@ struct TypeTraits { using BuilderType = HalfFloatBuilder; using TensorType = HalfFloatTensor; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return elements * sizeof(uint16_t); } constexpr static bool is_parameter_free = true; @@ -214,7 +214,7 @@ struct TypeTraits { using BuilderType = FloatBuilder; using TensorType = FloatTensor; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return static_cast(elements * sizeof(float)); } constexpr static bool is_parameter_free = true; @@ -227,7 +227,7 @@ struct TypeTraits { using BuilderType = DoubleBuilder; using TensorType = DoubleTensor; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return static_cast(elements * sizeof(double)); } constexpr static bool is_parameter_free = true; @@ -246,7 +246,7 @@ struct TypeTraits { using ArrayType = BooleanArray; using BuilderType = BooleanBuilder; - static inline int64_t bytes_required(int64_t elements) { + static constexpr int64_t bytes_required(int64_t elements) { return BitUtil::BytesForBits(elements); } constexpr static bool is_parameter_free = true; @@ -371,6 +371,11 @@ template using enable_if_boolean = typename std::enable_if::value>::type; +template +using enable_if_binary_like = + typename std::enable_if::value || + std::is_base_of::value>::type; + template using enable_if_fixed_size_binary = typename std::enable_if::value>::type; @@ -401,8 +406,8 @@ struct as_void { using type = typename T::ATTR_NAME; \ }; -GET_ATTR(c_type, void); -GET_ATTR(TypeClass, void); +GET_ATTR(c_type, void) +GET_ATTR(TypeClass, void) #undef GET_ATTR diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 
6b9c3590b44dc..fefc8d6da8098 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -20,45 +20,7 @@ ####################################### # Headers: top level -install(FILES - bit-stream-utils.h - bit-util.h - bpacking.h - checked_cast.h - compiler-util.h - compression.h - compression_brotli.h - compression_bz2.h - compression_lz4.h - compression_snappy.h - compression_zlib.h - compression_zstd.h - cpu-info.h - date.h - decimal.h - hash-util.h - hashing.h - io-util.h - key_value_metadata.h - lazy.h - logging.h - macros.h - memory.h - neon-util.h - parallel.h - rle-encoding.h - sse-util.h - stl.h - stopwatch.h - string.h - string_view.h - thread-pool.h - type_traits.h - utf8.h - variant.h - visibility.h - windows_compatibility.h - DESTINATION include/arrow/util) +ARROW_INSTALL_ALL_HEADERS("arrow/util") ####################################### # arrow_test_main @@ -68,22 +30,22 @@ if (ARROW_BUILD_BENCHMARKS) add_library(arrow_benchmark_main benchmark_main.cc) if (APPLE) target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static ) elseif(MSVC) target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static Shlwapi.lib ) else() target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static pthread ) endif() # TODO(wesm): Some benchmarks include gtest.h - add_dependencies(arrow_benchmark_main gtest_static) + add_dependencies(arrow_benchmark_main ${GTEST_LIBRARY}) endif() ADD_ARROW_TEST(bit-util-test) @@ -100,6 +62,7 @@ ADD_ARROW_TEST(rle-encoding-test) ADD_ARROW_TEST(stl-util-test) ADD_ARROW_TEST(task-group-test) ADD_ARROW_TEST(thread-pool-test) +ADD_ARROW_TEST(trie-test) ADD_ARROW_TEST(utf8-util-test) ADD_ARROW_BENCHMARK(bit-util-benchmark) @@ -108,8 +71,8 @@ ADD_ARROW_BENCHMARK(decimal-benchmark) ADD_ARROW_BENCHMARK(hashing-benchmark) ADD_ARROW_BENCHMARK(int-util-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) +ADD_ARROW_BENCHMARK(machine-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) +ADD_ARROW_BENCHMARK(thread-pool-benchmark) +ADD_ARROW_BENCHMARK(trie-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) - -add_subdirectory(string_view) -add_subdirectory(variant) diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc new file mode 100644 index 0000000000000..bb235f4016619 --- /dev/null +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -0,0 +1,690 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
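The new file basic_decimal.cc that follows implements BasicDecimal128, a signed 128-bit integer in two's complement split into a signed high word and an unsigned low word (declared in basic_decimal.h later in this patch). A quick illustration of that layout, hypothetical and not part of the patch:

  BasicDecimal128 zero;              // high == 0,  low == 0
  BasicDecimal128 minus_one(-1);     // sign-extended: high == -1, low == UINT64_MAX
  BasicDecimal128 two_pow_64(1, 0);  // (high, low) == (1, 0) represents 2^64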
+ +#include "arrow/util/basic_decimal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow { + +using internal::SafeLeftShift; +using internal::SafeSignedAdd; + +static const BasicDecimal128 ScaleMultipliers[] = { + BasicDecimal128(1LL), + BasicDecimal128(10LL), + BasicDecimal128(100LL), + BasicDecimal128(1000LL), + BasicDecimal128(10000LL), + BasicDecimal128(100000LL), + BasicDecimal128(1000000LL), + BasicDecimal128(10000000LL), + BasicDecimal128(100000000LL), + BasicDecimal128(1000000000LL), + BasicDecimal128(10000000000LL), + BasicDecimal128(100000000000LL), + BasicDecimal128(1000000000000LL), + BasicDecimal128(10000000000000LL), + BasicDecimal128(100000000000000LL), + BasicDecimal128(1000000000000000LL), + BasicDecimal128(10000000000000000LL), + BasicDecimal128(100000000000000000LL), + BasicDecimal128(1000000000000000000LL), + BasicDecimal128(0LL, 10000000000000000000ULL), + BasicDecimal128(5LL, 7766279631452241920ULL), + BasicDecimal128(54LL, 3875820019684212736ULL), + BasicDecimal128(542LL, 1864712049423024128ULL), + BasicDecimal128(5421LL, 200376420520689664ULL), + BasicDecimal128(54210LL, 2003764205206896640ULL), + BasicDecimal128(542101LL, 1590897978359414784ULL), + BasicDecimal128(5421010LL, 15908979783594147840ULL), + BasicDecimal128(54210108LL, 11515845246265065472ULL), + BasicDecimal128(542101086LL, 4477988020393345024ULL), + BasicDecimal128(5421010862LL, 7886392056514347008ULL), + BasicDecimal128(54210108624LL, 5076944270305263616ULL), + BasicDecimal128(542101086242LL, 13875954555633532928ULL), + BasicDecimal128(5421010862427LL, 9632337040368467968ULL), + BasicDecimal128(54210108624275LL, 4089650035136921600ULL), + BasicDecimal128(542101086242752LL, 4003012203950112768ULL), + BasicDecimal128(5421010862427522LL, 3136633892082024448ULL), + BasicDecimal128(54210108624275221LL, 12919594847110692864ULL), + BasicDecimal128(542101086242752217LL, 68739955140067328ULL), + BasicDecimal128(5421010862427522170LL, 687399551400673280ULL)}; + +static const BasicDecimal128 ScaleMultipliersHalf[] = { + BasicDecimal128(0ULL), + BasicDecimal128(5ULL), + BasicDecimal128(50ULL), + BasicDecimal128(500ULL), + BasicDecimal128(5000ULL), + BasicDecimal128(50000ULL), + BasicDecimal128(500000ULL), + BasicDecimal128(5000000ULL), + BasicDecimal128(50000000ULL), + BasicDecimal128(500000000ULL), + BasicDecimal128(5000000000ULL), + BasicDecimal128(50000000000ULL), + BasicDecimal128(500000000000ULL), + BasicDecimal128(5000000000000ULL), + BasicDecimal128(50000000000000ULL), + BasicDecimal128(500000000000000ULL), + BasicDecimal128(5000000000000000ULL), + BasicDecimal128(50000000000000000ULL), + BasicDecimal128(500000000000000000ULL), + BasicDecimal128(5000000000000000000ULL), + BasicDecimal128(2LL, 13106511852580896768ULL), + BasicDecimal128(27LL, 1937910009842106368ULL), + BasicDecimal128(271LL, 932356024711512064ULL), + BasicDecimal128(2710LL, 9323560247115120640ULL), + BasicDecimal128(27105LL, 1001882102603448320ULL), + BasicDecimal128(271050LL, 10018821026034483200ULL), + BasicDecimal128(2710505LL, 7954489891797073920ULL), + BasicDecimal128(27105054LL, 5757922623132532736ULL), + BasicDecimal128(271050543LL, 2238994010196672512ULL), + BasicDecimal128(2710505431LL, 3943196028257173504ULL), + BasicDecimal128(27105054312LL, 2538472135152631808ULL), + BasicDecimal128(271050543121LL, 6937977277816766464ULL), + 
BasicDecimal128(2710505431213LL, 14039540557039009792ULL),
+    BasicDecimal128(27105054312137LL, 11268197054423236608ULL),
+    BasicDecimal128(271050543121376LL, 2001506101975056384ULL),
+    BasicDecimal128(2710505431213761LL, 1568316946041012224ULL),
+    BasicDecimal128(27105054312137610LL, 15683169460410122240ULL),
+    BasicDecimal128(271050543121376108LL, 9257742014424809472ULL),
+    BasicDecimal128(2710505431213761085LL, 343699775700336640ULL)};
+
+static constexpr uint64_t kIntMask = 0xFFFFFFFF;
+static constexpr auto kCarryBit = static_cast<uint64_t>(1) << static_cast<uint64_t>(32);
+
+BasicDecimal128::BasicDecimal128(const uint8_t* bytes)
+    : BasicDecimal128(
+          BitUtil::FromLittleEndian(reinterpret_cast<const int64_t*>(bytes)[1]),
+          BitUtil::FromLittleEndian(reinterpret_cast<const uint64_t*>(bytes)[0])) {}
+
+std::array<uint8_t, 16> BasicDecimal128::ToBytes() const {
+  std::array<uint8_t, 16> out{{0}};
+  ToBytes(out.data());
+  return out;
+}
+
+void BasicDecimal128::ToBytes(uint8_t* out) const {
+  DCHECK_NE(out, nullptr);
+  reinterpret_cast<uint64_t*>(out)[0] = BitUtil::ToLittleEndian(low_bits_);
+  reinterpret_cast<int64_t*>(out)[1] = BitUtil::ToLittleEndian(high_bits_);
+}
+
+BasicDecimal128& BasicDecimal128::Negate() {
+  low_bits_ = ~low_bits_ + 1;
+  high_bits_ = ~high_bits_;
+  if (low_bits_ == 0) {
+    high_bits_ = SafeSignedAdd(high_bits_, 1);
+  }
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::Abs() { return *this < 0 ? Negate() : *this; }
+
+BasicDecimal128& BasicDecimal128::operator+=(const BasicDecimal128& right) {
+  const uint64_t sum = low_bits_ + right.low_bits_;
+  high_bits_ = SafeSignedAdd(high_bits_, right.high_bits_);
+  if (sum < low_bits_) {
+    high_bits_ = SafeSignedAdd(high_bits_, 1);
+  }
+  low_bits_ = sum;
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator-=(const BasicDecimal128& right) {
+  const uint64_t diff = low_bits_ - right.low_bits_;
+  high_bits_ -= right.high_bits_;
+  if (diff > low_bits_) {
+    --high_bits_;
+  }
+  low_bits_ = diff;
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator/=(const BasicDecimal128& right) {
+  BasicDecimal128 remainder;
+  auto s = Divide(right, this, &remainder);
+  DCHECK_EQ(s, DecimalStatus::kSuccess);
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator|=(const BasicDecimal128& right) {
+  low_bits_ |= right.low_bits_;
+  high_bits_ |= right.high_bits_;
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator&=(const BasicDecimal128& right) {
+  low_bits_ &= right.low_bits_;
+  high_bits_ &= right.high_bits_;
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator<<=(uint32_t bits) {
+  if (bits != 0) {
+    if (bits < 64) {
+      high_bits_ = SafeLeftShift(high_bits_, bits);
+      high_bits_ |= (low_bits_ >> (64 - bits));
+      low_bits_ <<= bits;
+    } else if (bits < 128) {
+      high_bits_ = static_cast<int64_t>(low_bits_) << (bits - 64);
+      low_bits_ = 0;
+    } else {
+      high_bits_ = 0;
+      low_bits_ = 0;
+    }
+  }
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator>>=(uint32_t bits) {
+  if (bits != 0) {
+    if (bits < 64) {
+      low_bits_ >>= bits;
+      low_bits_ |= static_cast<uint64_t>(high_bits_ << (64 - bits));
+      high_bits_ = static_cast<int64_t>(static_cast<uint64_t>(high_bits_) >> bits);
+    } else if (bits < 128) {
+      low_bits_ = static_cast<uint64_t>(high_bits_ >> (bits - 64));
+      high_bits_ = static_cast<int64_t>(high_bits_ >= 0L ? 0L : -1L);
+    } else {
+      high_bits_ = static_cast<int64_t>(high_bits_ >= 0L ? 0L : -1L);
+      low_bits_ = static_cast<uint64_t>(high_bits_);
+    }
+  }
+  return *this;
+}
+
+BasicDecimal128& BasicDecimal128::operator*=(const BasicDecimal128& right) {
+  // Break the left and right numbers into 32 bit chunks
+  // so that we can multiply them without overflow.
+  const uint64_t L0 = static_cast<uint64_t>(high_bits_) >> 32;
+  const uint64_t L1 = static_cast<uint64_t>(high_bits_) & kIntMask;
+  const uint64_t L2 = low_bits_ >> 32;
+  const uint64_t L3 = low_bits_ & kIntMask;
+
+  const uint64_t R0 = static_cast<uint64_t>(right.high_bits_) >> 32;
+  const uint64_t R1 = static_cast<uint64_t>(right.high_bits_) & kIntMask;
+  const uint64_t R2 = right.low_bits_ >> 32;
+  const uint64_t R3 = right.low_bits_ & kIntMask;
+
+  uint64_t product = L3 * R3;
+  low_bits_ = product & kIntMask;
+
+  uint64_t sum = product >> 32;
+
+  product = L2 * R3;
+  sum += product;
+
+  product = L3 * R2;
+  sum += product;
+
+  low_bits_ += sum << 32;
+
+  high_bits_ = static_cast<int64_t>(sum < product ? kCarryBit : 0);
+  if (sum < product) {
+    high_bits_ += kCarryBit;
+  }
+
+  high_bits_ += static_cast<int64_t>(sum >> 32);
+  high_bits_ += L1 * R3 + L2 * R2 + L3 * R1;
+  high_bits_ += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
+  return *this;
+}
+
+/// Expands the given value into an array of ints so that we can work on
+/// it. The array will be converted to an absolute value and the wasNegative
+/// flag will be set appropriately. The array will remove leading zeros from
+/// the value.
+/// \param array an array of length 4 to set with the value
+/// \param was_negative a flag for whether the value was originally negative
+/// \result the output length of the array
+static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
+                           bool& was_negative) {
+  uint64_t high;
+  uint64_t low;
+  const int64_t highbits = value.high_bits();
+  const uint64_t lowbits = value.low_bits();
+
+  if (highbits < 0) {
+    low = ~lowbits + 1;
+    high = static_cast<uint64_t>(~highbits);
+    if (low == 0) {
+      ++high;
+    }
+    was_negative = true;
+  } else {
+    low = lowbits;
+    high = static_cast<uint64_t>(highbits);
+    was_negative = false;
+  }
+
+  if (high != 0) {
+    if (high > std::numeric_limits<uint32_t>::max()) {
+      array[0] = static_cast<uint32_t>(high >> 32);
+      array[1] = static_cast<uint32_t>(high);
+      array[2] = static_cast<uint32_t>(low >> 32);
+      array[3] = static_cast<uint32_t>(low);
+      return 4;
+    }
+
+    array[0] = static_cast<uint32_t>(high);
+    array[1] = static_cast<uint32_t>(low >> 32);
+    array[2] = static_cast<uint32_t>(low);
+    return 3;
+  }
+
+  if (low >= std::numeric_limits<uint32_t>::max()) {
+    array[0] = static_cast<uint32_t>(low >> 32);
+    array[1] = static_cast<uint32_t>(low);
+    return 2;
+  }
+
+  if (low == 0) {
+    return 0;
+  }
+
+  array[0] = static_cast<uint32_t>(low);
+  return 1;
+}
+
+/// Shift the number in the array left by bits positions.
+/// \param array the number to shift, must have length elements
+/// \param length the number of entries in the array
+/// \param bits the number of bits to shift (0 <= bits < 32)
+static void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
+  if (length > 0 && bits != 0) {
+    for (int64_t i = 0; i < length - 1; ++i) {
+      array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits));
+    }
+    array[length - 1] <<= bits;
+  }
+}
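FillInArray above is the bridge between the two 64-bit words and the 32-bit limbs that the shift and divide helpers below operate on. A worked example of the decomposition (illustration only; the helper is file-local):

  // value = 2^40, i.e. high = 0 and low = 0x100'00000000
  uint32_t array[4];
  bool was_negative;
  int64_t n = FillInArray(BasicDecimal128(uint64_t(1) << 40), array, was_negative);
  // n == 2, array[0] == 0x100, array[1] == 0, was_negative == false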
+
+/// Shift the number in the array right by bits positions.
+/// \param array the number to shift, must have length elements
+/// \param length the number of entries in the array
+/// \param bits the number of bits to shift (0 <= bits < 32)
+static void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+  if (length > 0 && bits != 0) {
+    for (int64_t i = length - 1; i > 0; --i) {
+      array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits));
+    }
+    array[0] >>= bits;
+  }
+}
+
+/// \brief Fix the signs of the result and remainder at the end of the division based on
+/// the signs of the dividend and divisor.
+static void FixDivisionSigns(BasicDecimal128* result, BasicDecimal128* remainder,
+                             bool dividend_was_negative, bool divisor_was_negative) {
+  if (dividend_was_negative != divisor_was_negative) {
+    result->Negate();
+  }
+
+  if (dividend_was_negative) {
+    remainder->Negate();
+  }
+}
+
+/// \brief Build a BasicDecimal128 from a list of ints.
+static DecimalStatus BuildFromArray(BasicDecimal128* value, uint32_t* array,
+                                    int64_t length) {
+  switch (length) {
+    case 0:
+      *value = {static_cast<int64_t>(0)};
+      break;
+    case 1:
+      *value = {static_cast<int64_t>(array[0])};
+      break;
+    case 2:
+      *value = {static_cast<int64_t>(0),
+                (static_cast<uint64_t>(array[0]) << 32) + array[1]};
+      break;
+    case 3:
+      *value = {static_cast<int64_t>(array[0]),
+                (static_cast<uint64_t>(array[1]) << 32) + array[2]};
+      break;
+    case 4:
+      *value = {(static_cast<int64_t>(array[0]) << 32) + array[1],
+                (static_cast<uint64_t>(array[2]) << 32) + array[3]};
+      break;
+    case 5:
+      if (array[0] != 0) {
+        return DecimalStatus::kOverflow;
+      }
+      *value = {(static_cast<int64_t>(array[1]) << 32) + array[2],
+                (static_cast<uint64_t>(array[3]) << 32) + array[4]};
+      break;
+    default:
+      return DecimalStatus::kOverflow;
+  }
+
+  return DecimalStatus::kSuccess;
+}
+
+/// \brief Do a division where the divisor fits into a single 32 bit value.
+static DecimalStatus SingleDivide(const uint32_t* dividend, int64_t dividend_length,
+                                  uint32_t divisor, BasicDecimal128* remainder,
+                                  bool dividend_was_negative, bool divisor_was_negative,
+                                  BasicDecimal128* result) {
+  uint64_t r = 0;
+  uint32_t result_array[5];
+  for (int64_t j = 0; j < dividend_length; j++) {
+    r <<= 32;
+    r += dividend[j];
+    result_array[j] = static_cast<uint32_t>(r / divisor);
+    r %= divisor;
+  }
+  auto status = BuildFromArray(result, result_array, dividend_length);
+  if (status != DecimalStatus::kSuccess) {
+    return status;
+  }
+
+  *remainder = static_cast<int64_t>(r);
+  FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative);
+  return DecimalStatus::kSuccess;
+}
+
+DecimalStatus BasicDecimal128::Divide(const BasicDecimal128& divisor,
+                                      BasicDecimal128* result,
+                                      BasicDecimal128* remainder) const {
+  // Split the dividend and divisor into integer pieces so that we can
+  // work on them.
+  uint32_t dividend_array[5];
+  uint32_t divisor_array[4];
+  bool dividend_was_negative;
+  bool divisor_was_negative;
+  // leave an extra zero before the dividend
+  dividend_array[0] = 0;
+  int64_t dividend_length =
+      FillInArray(*this, dividend_array + 1, dividend_was_negative) + 1;
+  int64_t divisor_length = FillInArray(divisor, divisor_array, divisor_was_negative);
+
+  // Handle some of the easy cases.
+  if (dividend_length <= divisor_length) {
+    *remainder = *this;
+    *result = 0;
+    return DecimalStatus::kSuccess;
+  }
+
+  if (divisor_length == 0) {
+    return DecimalStatus::kDivideByZero;
+  }
+
+  if (divisor_length == 1) {
+    return SingleDivide(dividend_array, dividend_length, divisor_array[0], remainder,
+                        dividend_was_negative, divisor_was_negative, result);
+  }
+
+  int64_t result_length = dividend_length - divisor_length;
+  uint32_t result_array[4];
+
+  // Normalize by shifting both by a multiple of 2 so that
+  // the digit guessing is better. The requirement is that
+  // divisor_array[0] is greater than 2**31.
+  int64_t normalize_bits = BitUtil::CountLeadingZeros(divisor_array[0]);
+  ShiftArrayLeft(divisor_array, divisor_length, normalize_bits);
+  ShiftArrayLeft(dividend_array, dividend_length, normalize_bits);
+
+  // compute each digit in the result
+  for (int64_t j = 0; j < result_length; ++j) {
+    // Guess the next digit. At worst it is two too large
+    uint32_t guess = std::numeric_limits<uint32_t>::max();
+    const auto high_dividend =
+        static_cast<uint64_t>(dividend_array[j]) << 32 | dividend_array[j + 1];
+    if (dividend_array[j] != divisor_array[0]) {
+      guess = static_cast<uint32_t>(high_dividend / divisor_array[0]);
+    }
+
+    // catch all of the cases where guess is two too large and most of the
+    // cases where it is one too large
+    auto rhat = static_cast<uint32_t>(high_dividend -
+                                      guess * static_cast<uint64_t>(divisor_array[0]));
+    while (static_cast<uint64_t>(divisor_array[1]) * guess >
+           (static_cast<uint64_t>(rhat) << 32) + dividend_array[j + 2]) {
+      --guess;
+      rhat += divisor_array[0];
+      if (static_cast<uint64_t>(rhat) < divisor_array[0]) {
+        break;
+      }
+    }
+
+    // subtract off the guess * divisor from the dividend
+    uint64_t mult = 0;
+    for (int64_t i = divisor_length - 1; i >= 0; --i) {
+      mult += static_cast<uint64_t>(guess) * divisor_array[i];
+      uint32_t prev = dividend_array[j + i + 1];
+      dividend_array[j + i + 1] -= static_cast<uint32_t>(mult);
+      mult >>= 32;
+      if (dividend_array[j + i + 1] > prev) {
+        ++mult;
+      }
+    }
+    uint32_t prev = dividend_array[j];
+    dividend_array[j] -= static_cast<uint32_t>(mult);
+
+    // if guess was too big, we add back divisor
+    if (dividend_array[j] > prev) {
+      --guess;
+      uint32_t carry = 0;
+      for (int64_t i = divisor_length - 1; i >= 0; --i) {
+        const auto sum =
+            static_cast<uint64_t>(divisor_array[i]) + dividend_array[j + i + 1] + carry;
+        dividend_array[j + i + 1] = static_cast<uint32_t>(sum);
+        carry = static_cast<uint32_t>(sum >> 32);
+      }
+      dividend_array[j] += carry;
+    }
+
+    result_array[j] = guess;
+  }
+
+  // denormalize the remainder
+  ShiftArrayRight(dividend_array, dividend_length, normalize_bits);
+
+  // return result and remainder
+  auto status = BuildFromArray(result, result_array, result_length);
+  if (status != DecimalStatus::kSuccess) {
+    return status;
+  }
+  status = BuildFromArray(remainder, dividend_array, dividend_length);
+  if (status != DecimalStatus::kSuccess) {
+    return status;
+  }
+
+  FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative);
+  return DecimalStatus::kSuccess;
+}
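Divide above is a textbook multi-word long division (Knuth's Algorithm D): normalize so the top divisor limb exceeds 2^31, guess each 32-bit quotient digit from the two leading dividend limbs, correct the guess at most twice, then denormalize the remainder. Per the contract documented in basic_decimal.h below, the quotient rounds toward zero and the remainder carries the dividend's sign, e.g. (sketch):

  BasicDecimal128 result, remainder;
  BasicDecimal128(-21).Divide(BasicDecimal128(5), &result, &remainder);
  // result == -4, remainder == -1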
+
+bool operator==(const BasicDecimal128& left, const BasicDecimal128& right) {
+  return left.high_bits() == right.high_bits() && left.low_bits() == right.low_bits();
+}
+
+bool operator!=(const BasicDecimal128& left, const BasicDecimal128& right) {
+  return !operator==(left, right);
+}
+
+bool operator<(const BasicDecimal128& left, const BasicDecimal128& right) {
+  return left.high_bits() < right.high_bits() ||
+         (left.high_bits() == right.high_bits() && left.low_bits() < right.low_bits());
+}
+
+bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right) {
+  return !operator>(left, right);
+}
+
+bool operator>(const BasicDecimal128& left, const BasicDecimal128& right) {
+  return operator<(right, left);
+}
+
+bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right) {
+  return !operator<(left, right);
+}
+
+BasicDecimal128 operator-(const BasicDecimal128& operand) {
+  BasicDecimal128 result(operand.high_bits(), operand.low_bits());
+  return result.Negate();
+}
+
+BasicDecimal128 operator~(const BasicDecimal128& operand) {
+  BasicDecimal128 result(~operand.high_bits(), ~operand.low_bits());
+  return result;
+}
+
+BasicDecimal128 operator+(const BasicDecimal128& left, const BasicDecimal128& right) {
+  BasicDecimal128 result(left.high_bits(), left.low_bits());
+  result += right;
+  return result;
+}
+
+BasicDecimal128 operator-(const BasicDecimal128& left, const BasicDecimal128& right) {
+  BasicDecimal128 result(left.high_bits(), left.low_bits());
+  result -= right;
+  return result;
+}
+
+BasicDecimal128 operator*(const BasicDecimal128& left, const BasicDecimal128& right) {
+  BasicDecimal128 result(left.high_bits(), left.low_bits());
+  result *= right;
+  return result;
+}
+
+BasicDecimal128 operator/(const BasicDecimal128& left, const BasicDecimal128& right) {
+  BasicDecimal128 remainder;
+  BasicDecimal128 result;
+  auto s = left.Divide(right, &result, &remainder);
+  DCHECK_EQ(s, DecimalStatus::kSuccess);
+  return result;
+}
+
+BasicDecimal128 operator%(const BasicDecimal128& left, const BasicDecimal128& right) {
+  BasicDecimal128 remainder;
+  BasicDecimal128 result;
+  auto s = left.Divide(right, &result, &remainder);
+  DCHECK_EQ(s, DecimalStatus::kSuccess);
+  return remainder;
+}
+
+static bool RescaleWouldCauseDataLoss(const BasicDecimal128& value, int32_t delta_scale,
+                                      int32_t abs_delta_scale, BasicDecimal128* result) {
+  BasicDecimal128 multiplier(ScaleMultipliers[abs_delta_scale]);
+
+  if (delta_scale < 0) {
+    DCHECK_NE(multiplier, 0);
+    BasicDecimal128 remainder;
+    auto status = value.Divide(multiplier, result, &remainder);
+    DCHECK_EQ(status, DecimalStatus::kSuccess);
+    return remainder != 0;
+  }
+
+  *result = value * multiplier;
+  return (value < 0) ? *result > value : *result < value;
+}
+
+DecimalStatus BasicDecimal128::Rescale(int32_t original_scale, int32_t new_scale,
+                                       BasicDecimal128* out) const {
+  DCHECK_NE(out, nullptr);
+  DCHECK_NE(original_scale, new_scale);
+
+  const int32_t delta_scale = new_scale - original_scale;
+  const int32_t abs_delta_scale = std::abs(delta_scale);
+
+  DCHECK_GE(abs_delta_scale, 1);
+  DCHECK_LE(abs_delta_scale, 38);
+
+  BasicDecimal128 result(*this);
+  const bool rescale_would_cause_data_loss =
+      RescaleWouldCauseDataLoss(result, delta_scale, abs_delta_scale, out);
+
+  // Fail if we overflow or truncate
+  if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) {
+    return DecimalStatus::kRescaleDataLoss;
+  }
+
+  return DecimalStatus::kSuccess;
+}
+
+void BasicDecimal128::GetWholeAndFraction(int scale, BasicDecimal128* whole,
+                                          BasicDecimal128* fraction) const {
+  DCHECK_GE(scale, 0);
+  DCHECK_LE(scale, 38);
+
+  BasicDecimal128 multiplier(ScaleMultipliers[scale]);
+  DCHECK_EQ(Divide(multiplier, whole, fraction), DecimalStatus::kSuccess);
+}
+
+const BasicDecimal128& BasicDecimal128::GetScaleMultiplier(int32_t scale) {
+  DCHECK_GE(scale, 0);
+  DCHECK_LE(scale, 38);
+
+  return ScaleMultipliers[scale];
+}
+
+BasicDecimal128 BasicDecimal128::IncreaseScaleBy(int32_t increase_by) const {
+  DCHECK_GE(increase_by, 0);
+  DCHECK_LE(increase_by, 38);
+
+  return (*this) * ScaleMultipliers[increase_by];
+}
+
+BasicDecimal128 BasicDecimal128::ReduceScaleBy(int32_t reduce_by, bool round) const {
+  DCHECK_GE(reduce_by, 0);
+  DCHECK_LE(reduce_by, 38);
+
+  BasicDecimal128 divisor(ScaleMultipliers[reduce_by]);
+  BasicDecimal128 result;
+  BasicDecimal128 remainder;
+  DCHECK_EQ(Divide(divisor, &result, &remainder), DecimalStatus::kSuccess);
+  if (round) {
+    auto divisor_half = ScaleMultipliersHalf[reduce_by];
+    if (remainder.Abs() >= divisor_half) {
+      if (result > 0) {
+        result += 1;
+      } else {
+        result -= 1;
+      }
+    }
+  }
+  return result;
+}
+
+int32_t BasicDecimal128::CountLeadingBinaryZeros() const {
+  DCHECK_GE(*this, BasicDecimal128(0));
+
+  if (high_bits_ == 0) {
+    return BitUtil::CountLeadingZeros(low_bits_) + 64;
+  } else {
+    return BitUtil::CountLeadingZeros(static_cast<uint64_t>(high_bits_));
+  }
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h
new file mode 100644
index 0000000000000..e19cb14a00ffb
--- /dev/null
+++ b/cpp/src/arrow/util/basic_decimal.h
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
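Rescale above changes the scale of a value by multiplying or dividing by a power of ten, reporting kRescaleDataLoss if nonzero digits would be dropped or the multiplication overflows. A minimal sketch of both outcomes (illustrative values):

  BasicDecimal128 out;
  // 12.3 (123 at scale 1) -> scale 3: multiply by 10^2, no loss
  auto s1 = BasicDecimal128(123).Rescale(1, 3, &out);  // kSuccess, out == 12300
  // 12.3 -> scale 0 would drop the trailing 3
  auto s2 = BasicDecimal128(123).Rescale(1, 0, &out);  // kRescaleDataLoss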
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+enum class DecimalStatus {
+  kSuccess,
+  kDivideByZero,
+  kOverflow,
+  kRescaleDataLoss,
+};
+
+/// Represents a signed 128-bit integer in two's complement.
+///
+/// This class is also compiled into LLVM IR - so, it should not have cpp references like
+/// streams and boost.
+class ARROW_EXPORT BasicDecimal128 {
+ public:
+  /// \brief Create a BasicDecimal128 from the two's complement representation.
+  constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
+      : low_bits_(low), high_bits_(high) {}
+
+  /// \brief Empty constructor creates a BasicDecimal128 with a value of 0.
+  constexpr BasicDecimal128() noexcept : BasicDecimal128(0, 0) {}
+
+  /// \brief Convert any integer value into a BasicDecimal128.
+  template <typename T,
+            typename = typename std::enable_if<std::is_integral<T>::value, T>::type>
+  constexpr BasicDecimal128(T value) noexcept
+      : BasicDecimal128(static_cast<int64_t>(value) >= 0 ? 0 : -1,
+                        static_cast<uint64_t>(value)) {}
+
+  /// \brief Create a BasicDecimal128 from an array of bytes. Bytes are assumed to be in
+  /// little-endian byte order.
+  explicit BasicDecimal128(const uint8_t* bytes);
+
+  /// \brief Negate the current value (in-place)
+  BasicDecimal128& Negate();
+
+  /// \brief Absolute value (in-place)
+  BasicDecimal128& Abs();
+
+  /// \brief Add a number to this one. The result is truncated to 128 bits.
+  BasicDecimal128& operator+=(const BasicDecimal128& right);
+
+  /// \brief Subtract a number from this one. The result is truncated to 128 bits.
+  BasicDecimal128& operator-=(const BasicDecimal128& right);
+
+  /// \brief Multiply this number by another number. The result is truncated to 128 bits.
+  BasicDecimal128& operator*=(const BasicDecimal128& right);
+
+  /// Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///    21 /  5 ->  4,  1
+  ///   -21 /  5 -> -4, -1
+  ///    21 / -5 -> -4,  1
+  ///   -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \param[out] result the quotient
+  /// \param[out] remainder the remainder after the division
+  DecimalStatus Divide(const BasicDecimal128& divisor, BasicDecimal128* result,
+                       BasicDecimal128* remainder) const;
+
+  /// \brief In-place division.
+  BasicDecimal128& operator/=(const BasicDecimal128& right);
+
+  /// \brief Bitwise "or" between two BasicDecimal128.
+  BasicDecimal128& operator|=(const BasicDecimal128& right);
+
+  /// \brief Bitwise "and" between two BasicDecimal128.
+  BasicDecimal128& operator&=(const BasicDecimal128& right);
+
+  /// \brief Shift left by the given number of bits.
+  BasicDecimal128& operator<<=(uint32_t bits);
+
+  /// \brief Shift right by the given number of bits. Negative values will sign-extend.
+  BasicDecimal128& operator>>=(uint32_t bits);
+
+  /// \brief Get the high bits of the two's complement representation of the number.
+  inline int64_t high_bits() const { return high_bits_; }
+
+  /// \brief Get the low bits of the two's complement representation of the number.
+  inline uint64_t low_bits() const { return low_bits_; }
+
+  /// \brief Return the raw bytes of the value in little-endian byte order.
+  std::array<uint8_t, 16> ToBytes() const;
+  void ToBytes(uint8_t* out) const;
+
+  /// \brief Separate the integer and fractional parts for the given scale.
+  void GetWholeAndFraction(int32_t scale, BasicDecimal128* whole,
+                           BasicDecimal128* fraction) const;
+
+  /// \brief Scale multiplier for given scale value.
+  static const BasicDecimal128& GetScaleMultiplier(int32_t scale);
+
+  /// \brief Convert BasicDecimal128 from one scale to another
+  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+                        BasicDecimal128* out) const;
+
+  /// \brief Scale up.
+  BasicDecimal128 IncreaseScaleBy(int32_t increase_by) const;
+
+  /// \brief Scale down.
+  /// - If 'round' is true, the right-most digits are dropped and the result value is
+  ///   rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
+  ///   (>= 10^reduce_by / 2).
+  /// - If 'round' is false, the right-most digits are simply dropped.
+  BasicDecimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+  /// \brief count the number of leading binary zeroes.
+  int32_t CountLeadingBinaryZeros() const;
+
+ private:
+  uint64_t low_bits_;
+  int64_t high_bits_;
+};
+
+ARROW_EXPORT bool operator==(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator!=(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator<(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator>(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right);
+
+ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& operand);
+ARROW_EXPORT BasicDecimal128 operator~(const BasicDecimal128& operand);
+ARROW_EXPORT BasicDecimal128 operator+(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator*(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bit-stream-utils.h b/cpp/src/arrow/util/bit-stream-utils.h
index ff215e488b4a3..ad86ee87c9fda 100644
--- a/cpp/src/arrow/util/bit-stream-utils.h
+++ b/cpp/src/arrow/util/bit-stream-utils.h
@@ -110,7 +110,12 @@ class BitReader {
     memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
   }
 
-  BitReader() : buffer_(NULL), max_bytes_(0) {}
+  BitReader()
+      : buffer_(NULL),
+        max_bytes_(0),
+        buffered_values_(0),
+        byte_offset_(0),
+        bit_offset_(0) {}
 
   void Reset(const uint8_t* buffer, int buffer_len) {
     buffer_ = buffer;
@@ -392,7 +397,8 @@ inline bool BitReader::GetVlqInt(int32_t* v) {
 }
 
 inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-  uint32_t u = (v << 1) ^ (v >> 31);
+  // Note negative left shift is undefined
+  uint32_t u = (static_cast<uint32_t>(v) << 1) ^ (v >> 31);
   return PutVlqInt(u);
 }
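The PutZigZagVlqInt fix above performs the left shift on an unsigned value because shifting a negative signed int left is undefined behavior. Zig-zag encoding itself interleaves signed values so small magnitudes stay small as varints; a sketch of the mapping:

  // (static_cast<uint32_t>(v) << 1) ^ (v >> 31) maps:
  //   0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
  // so e.g. -1 encodes in a single VLQ byte instead of five.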
diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc
index cc71078880156..00093a2cf7b59 100644
--- a/cpp/src/arrow/util/bit-util-benchmark.cc
+++ b/cpp/src/arrow/util/bit-util-benchmark.cc
@@ -61,13 +61,15 @@ class NaiveBitmapWriter {
   void Set() {
     const int64_t byte_offset = position_ / 8;
     const int64_t bit_offset = position_ % 8;
-    bitmap_[byte_offset] |= static_cast<uint8_t>(1 << bit_offset);
+    auto bit_set_mask = (1U << bit_offset);
+    bitmap_[byte_offset] = static_cast<uint8_t>(bitmap_[byte_offset] | bit_set_mask);
   }
 
   void Clear() {
     const int64_t byte_offset = position_ / 8;
     const int64_t bit_offset = position_ % 8;
-    bitmap_[byte_offset] &= 0xFF ^ static_cast<uint8_t>(1 << bit_offset);
+    auto bit_clear_mask = 0xFFU ^ (1U << bit_offset);
+    bitmap_[byte_offset] = static_cast<uint8_t>(bitmap_[byte_offset] & bit_clear_mask);
   }
 
   void Next() { ++position_; }
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index 5f181e9b7b14c..6709ae4a7d853 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -167,33 +166,40 @@ TEST(BitmapReader, DoesNotReadOutOfBounds) {
 }
 
 TEST(BitmapWriter, NormalOperation) {
-  {
-    uint8_t bitmap[] = {0, 0, 0, 0};
-    auto writer = internal::BitmapWriter(bitmap, 0, 12);
-    WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
-    // {0b00110110, 0b1010, 0, 0}
-    ASSERT_BYTES_EQ(bitmap, {0x36, 0x0a, 0, 0});
-  }
-  {
-    uint8_t bitmap[] = {0xff, 0xff, 0xff, 0xff};
-    auto writer = internal::BitmapWriter(bitmap, 0, 12);
-    WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
-    // {0b00110110, 0b11111010, 0xff, 0xff}
-    ASSERT_BYTES_EQ(bitmap, {0x36, 0xfa, 0xff, 0xff});
-  }
-  {
-    uint8_t bitmap[] = {0, 0, 0, 0};
-    auto writer = internal::BitmapWriter(bitmap, 3, 12);
-    WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
-    // {0b10110000, 0b01010001, 0, 0}
-    ASSERT_BYTES_EQ(bitmap, {0xb0, 0x51, 0, 0});
-  }
-  {
-    uint8_t bitmap[] = {0, 0, 0, 0};
-    auto writer = internal::BitmapWriter(bitmap, 20, 12);
-    WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
-    // {0, 0, 0b01100000, 0b10100011}
-    ASSERT_BYTES_EQ(bitmap, {0, 0, 0x60, 0xa3});
+  for (const auto fill_byte_int : {0x00, 0xff}) {
+    const uint8_t fill_byte = static_cast<uint8_t>(fill_byte_int);
+    {
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      auto writer = internal::BitmapWriter(bitmap, 0, 12);
+      WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
+      // {0b00110110, 0b....1010, ........, ........}
+      ASSERT_BYTES_EQ(bitmap, {0x36, static_cast<uint8_t>(0x0a | (fill_byte & 0xf0)),
+                               fill_byte, fill_byte});
+    }
+    {
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      auto writer = internal::BitmapWriter(bitmap, 3, 12);
+      WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
+      // {0b10110..., 0b.1010001, ........, ........}
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>(0xb0 | (fill_byte & 0x07)),
+                               static_cast<uint8_t>(0x51 | (fill_byte & 0x80)), fill_byte,
+                               fill_byte});
+    }
+    {
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      auto writer = internal::BitmapWriter(bitmap, 20, 12);
+      WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1});
+      // {........, ........, 0b0110...., 0b10100011}
+      ASSERT_BYTES_EQ(bitmap, {fill_byte, fill_byte,
+                               static_cast<uint8_t>(0x60 | (fill_byte & 0x0f)), 0xa3});
+    }
+    // 0-length writes
+    for (int64_t pos = 0; pos < 32; ++pos) {
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      auto writer = internal::BitmapWriter(bitmap, pos, 0);
+      WriteVectorToWriter(writer, {});
+      ASSERT_BYTES_EQ(bitmap, {fill_byte, fill_byte, fill_byte, fill_byte});
+    }
   }
 }
 
@@ -266,6 +272,10 @@ TEST(FirstTimeBitmapWriter, NormalOperation) {
   }
   {
     uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+    {
+      auto writer = internal::FirstTimeBitmapWriter(bitmap, 4, 0);
+      WriteVectorToWriter(writer, {});
+    }
     {
       auto writer = internal::FirstTimeBitmapWriter(bitmap, 4, 6);
       WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1});
@@ -274,6 +284,10 @@ TEST(FirstTimeBitmapWriter, NormalOperation) {
       auto writer = internal::FirstTimeBitmapWriter(bitmap, 10, 3);
       WriteVectorToWriter(writer, {0, 0, 0});
     }
+    {
+      auto writer = internal::FirstTimeBitmapWriter(bitmap, 13, 0);
+      WriteVectorToWriter(writer, {});
+    }
     {
       auto writer = internal::FirstTimeBitmapWriter(bitmap, 13, 3);
       WriteVectorToWriter(writer, {1, 0, 1});
@@ -319,8 +333,8 @@ TYPED_TEST(TestGenerateBits, NormalOperation) {
   for (const int64_t start_offset : start_offsets) {
     for (const int64_t length : lengths) {
       for (const uint8_t fill_byte : fill_bytes) {
-        uint8_t bitmap[kSourceSize];
-        memset(bitmap, fill_byte, kSourceSize);
+        uint8_t bitmap[kSourceSize + 1];
+        memset(bitmap, fill_byte, kSourceSize + 1);
         // First call GenerateBits
         {
           int64_t ncalled = 0;
@@ -344,7 +358,7 @@ TYPED_TEST(TestGenerateBits, NormalOperation) {
             result_reader.Next();
           }
         }
-        // Check bits preceding and following generated contents weren't clobbered
+        // Check bits preceding generated contents weren't clobbered
         {
           internal::BitmapReader reader_before(bitmap, 0, start_offset);
           for (int64_t i = 0; i < start_offset; ++i) {
@@ -352,6 +366,9 @@ TYPED_TEST(TestGenerateBits, NormalOperation) {
               << "mismatch at preceding bit #" << start_offset - i;
           }
         }
+        // Check the byte following generated contents wasn't clobbered
+        auto byte_after = bitmap[BitUtil::CeilDiv(start_offset + length, 8)];
+        ASSERT_EQ(byte_after, fill_byte);
       }
     }
   }
@@ -499,6 +516,43 @@ TEST(BitUtilTests, TestCountSetBits) {
   }
 }
 
+TEST(BitUtilTests, TestSetBitsTo) {
+  using BitUtil::SetBitsTo;
+  for (const auto fill_byte_int : {0x00, 0xff}) {
+    const uint8_t fill_byte = static_cast<uint8_t>(fill_byte_int);
+    {
+      // test set within a byte
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      SetBitsTo(bitmap, 2, 2, true);
+      SetBitsTo(bitmap, 4, 2, false);
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & ~0x3C) | 0xC)});
+    }
+    {
+      // test straddling a single byte boundary
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      SetBitsTo(bitmap, 4, 7, true);
+      SetBitsTo(bitmap, 11, 7, false);
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x7,
+                               static_cast<uint8_t>(fill_byte & ~0x3)});
+    }
+    {
+      // test byte aligned end
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      SetBitsTo(bitmap, 4, 4, true);
+      SetBitsTo(bitmap, 8, 8, false);
+      ASSERT_BYTES_EQ(bitmap,
+                      {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x00, fill_byte});
+    }
+    {
+      // test byte aligned end, multiple bytes
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      SetBitsTo(bitmap, 0, 24, false);
+      uint8_t false_byte = static_cast<uint8_t>(0);
+      ASSERT_BYTES_EQ(bitmap, {false_byte, false_byte, false_byte, fill_byte});
+    }
+  }
+}
+
 TEST(BitUtilTests, TestCopyBitmap) {
   const int kBufferSize = 1000;
 
@@ -756,7 +810,9 @@ static void TestZigZag(int32_t v) {
 TEST(BitStreamUtil, ZigZag) {
   TestZigZag(0);
   TestZigZag(1);
+  TestZigZag(1234);
   TestZigZag(-1);
+  TestZigZag(-1234);
   TestZigZag(std::numeric_limits<int32_t>::max());
   TestZigZag(-std::numeric_limits<int32_t>::max());
 }
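The SetBitsTo helper exercised by TestSetBitsTo above (and defined in bit-util.h below) masks the partial bytes at either end of the range and memsets the whole bytes in between. Basic usage, assuming an LSB-ordered bitmap (illustration only):

  uint8_t bitmap[4] = {0, 0, 0, 0};
  arrow::BitUtil::SetBitsTo(bitmap, /*start_offset=*/2, /*length=*/2, true);
  // bits 2 and 3 set: bitmap[0] == 0b00001100, other bytes untouched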
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index 7b7a7261fad13..862b1fd050bc5 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -196,8 +196,8 @@ Status TransferBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset,
 
   TransferBitmap(data, offset, length, 0, dest);
 
-  // As we have freshly allocated this bitmap, we should take care of zeroing the remaing
-  // bits.
+  // As we have freshly allocated this bitmap, we should take care of zeroing the
+  // remaining bits.
   int64_t num_bytes = BitUtil::BytesForBits(length);
   int64_t bits_to_zero = num_bytes * 8 - length;
   for (int64_t i = length; i < length + bits_to_zero; ++i) {
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 93b6cb28d91b1..8e6979ff24b63 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -53,6 +53,7 @@
 #endif
 
 #include
+#include <cstring>
 #include
 #include
 #include
@@ -84,11 +85,11 @@ namespace BitUtil {
 //
 
 // Returns the ceil of value/divisor
-static inline int64_t CeilDiv(int64_t value, int64_t divisor) {
+constexpr int64_t CeilDiv(int64_t value, int64_t divisor) {
   return value / divisor + (value % divisor != 0);
 }
 
-static inline int64_t BytesForBits(int64_t bits) { return (bits + 7) >> 3; }
+constexpr int64_t BytesForBits(int64_t bits) { return (bits + 7) >> 3; }
 
 // Returns the smallest power of two that contains v. If v is already a
 // power of two, it is returned as is.
@@ -106,12 +107,12 @@ static inline int64_t NextPower2(int64_t n) {
   return n;
 }
 
-static inline bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
+constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
 
-static inline bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
+constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
 
 // Returns 'value' rounded up to the nearest multiple of 'factor'
-static inline int64_t RoundUp(int64_t value, int64_t factor) {
+constexpr int64_t RoundUp(int64_t value, int64_t factor) {
   return (value + (factor - 1)) / factor * factor;
 }
 
@@ -119,16 +120,14 @@ static inline int64_t RoundUp(int64_t value, int64_t factor) {
 // is a power of two.
 // The result is undefined on overflow, i.e. if `value > 2**64 - factor`,
 // since we cannot return the correct result which would be 2**64.
-static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
+constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
   // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
   return (value + (factor - 1)) & ~(factor - 1);
 }
 
-static inline int64_t RoundUpToMultipleOf8(int64_t num) {
-  return RoundUpToPowerOf2(num, 8);
-}
+constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return RoundUpToPowerOf2(num, 8); }
 
-static inline int64_t RoundUpToMultipleOf64(int64_t num) {
+constexpr int64_t RoundUpToMultipleOf64(int64_t num) {
   return RoundUpToPowerOf2(num, 64);
 }
 
@@ -329,6 +328,48 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
          kBitmask[i % 8];
 }
 
+/// \brief set or clear a range of bits quickly
+static inline void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length,
+                             bool bits_are_set) {
+  if (length == 0) return;
+
+  const auto i_begin = start_offset;
+  const auto i_end = start_offset + length;
+  const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
+
+  const auto bytes_begin = i_begin / 8;
+  const auto bytes_end = i_end / 8 + 1;
+
+  const auto first_byte_mask = kPrecedingBitmask[i_begin % 8];
+  const auto last_byte_mask = kTrailingBitmask[i_end % 8];
+
+  if (bytes_end == bytes_begin + 1) {
+    // set bits within a single byte
+    const auto only_byte_mask =
+        i_end % 8 == 0 ? first_byte_mask
+                       : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
+    bits[bytes_begin] &= only_byte_mask;
+    bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
+    return;
+  }
+
+  // set/clear trailing bits of first byte
+  bits[bytes_begin] &= first_byte_mask;
+  bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
+
+  if (bytes_end - bytes_begin > 2) {
+    // set/clear whole bytes
+    std::memset(bits + bytes_begin + 1, fill_byte,
+                static_cast<size_t>(bytes_end - bytes_begin - 2));
+  }
+
+  if (i_end % 8 == 0) return;
+
+  // set/clear leading bits of last byte
+  bits[bytes_end - 1] &= last_byte_mask;
+  bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
+}
+
 /// \brief Convert vector of bytes to bitmap buffer
 ARROW_EXPORT
 Status BytesToBits(const std::vector<uint8_t>&, MemoryPool*, std::shared_ptr<Buffer>*);
@@ -409,7 +450,7 @@ class BitmapWriter {
 
   void Finish() {
     // Store current byte if we didn't went past bitmap storage
-    if (bit_mask_ != 0x01 || position_ < length_) {
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
       bitmap_[byte_offset_] = current_byte_;
     }
   }
@@ -461,7 +502,7 @@ class FirstTimeBitmapWriter {
 
   void Finish() {
     // Store current byte if we didn't went past bitmap storage
-    if (bit_mask_ != 0x01 || position_ < length_) {
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
      bitmap_[byte_offset_] = current_byte_;
    }
  }
@@ -578,8 +619,8 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, int64
 /// \param[in] offset bit offset into the source data
 /// \param[in] length number of bits to copy
 /// \param[in] dest_offset bit offset into the destination
-/// \param[out] dest the destination buffer, must have at least space for (offset +
-/// length) bits
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
 ARROW_EXPORT
 void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
                 int64_t dest_offset);
@@ -590,8 +631,8 @@ void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t*
 /// \param[in] offset bit offset into the source data
 /// \param[in] length number of bits to copy
 /// \param[in] dest_offset bit offset into the destination
-/// \param[out] dest the destination buffer, must have at least space for (offset +
-/// length) bits
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
 ARROW_EXPORT
 void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
                   int64_t dest_offset);
@@ -613,7 +654,8 @@ Status InvertBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset,
 ///
 /// \param[in] data a packed LSB-ordered bitmap as a byte array
 /// \param[in] bit_offset a bitwise offset into the bitmap
-/// \param[in] length the number of bits to inspect in the bitmap relative to the offset
+/// \param[in] length the number of bits to inspect in the bitmap relative to
+/// the offset
 ///
 /// \return The number of set (1) bits in the range
 ARROW_EXPORT
diff --git a/cpp/src/arrow/util/compression-test.cc b/cpp/src/arrow/util/compression-test.cc
index e0e6f4837f201..22bec001bfd45 100644
--- a/cpp/src/arrow/util/compression-test.cc
+++ b/cpp/src/arrow/util/compression-test.cc
@@ -448,17 +448,22 @@ TEST_P(CodecTest, StreamingRoundtrip) {
 
 INSTANTIATE_TEST_CASE_P(TestGZip, CodecTest, ::testing::Values(Compression::GZIP));
 
-INSTANTIATE_TEST_CASE_P(TestZSTD, CodecTest, ::testing::Values(Compression::ZSTD));
-
 INSTANTIATE_TEST_CASE_P(TestSnappy, CodecTest,
::testing::Values(Compression::SNAPPY)); INSTANTIATE_TEST_CASE_P(TestLZ4, CodecTest, ::testing::Values(Compression::LZ4)); INSTANTIATE_TEST_CASE_P(TestBrotli, CodecTest, ::testing::Values(Compression::BROTLI)); +// bz2 requires a binary installation, there is no ExternalProject #if ARROW_WITH_BZ2 INSTANTIATE_TEST_CASE_P(TestBZ2, CodecTest, ::testing::Values(Compression::BZ2)); #endif +// The ExternalProject for zstd does not build on CMake < 3.7, so we do not +// require it here +#ifdef ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTD, CodecTest, ::testing::Values(Compression::ZSTD)); +#endif + } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/util/compression_brotli.cc b/cpp/src/arrow/util/compression_brotli.cc index 89d099d6a6067..3d75253e11d9f 100644 --- a/cpp/src/arrow/util/compression_brotli.cc +++ b/cpp/src/arrow/util/compression_brotli.cc @@ -81,9 +81,7 @@ class BrotliDecompressor : public Decompressor { Status BrotliError(const char* msg) { return Status::IOError(msg); } Status BrotliError(BrotliDecoderErrorCode code, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << BrotliDecoderErrorString(code); - return Status::IOError(ss.str()); + return Status::IOError(prefix_msg, BrotliDecoderErrorString(code)); } BrotliDecoderState* state_ = nullptr; diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 0acd54d057218..d157ba6176054 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -18,6 +18,7 @@ #include "arrow/util/compression_lz4.h" #include +#include #include #include @@ -30,6 +31,10 @@ namespace arrow { namespace util { +static Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { + return Status::IOError(prefix_msg, LZ4F_getErrorName(ret)); +} + // ---------------------------------------------------------------------- // Lz4 decompressor implementation @@ -78,12 +83,6 @@ class LZ4Decompressor : public Decompressor { bool IsFinished() override { return finished_; } protected: - Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << LZ4F_getErrorName(ret); - return Status::IOError(ss.str()); - } - LZ4F_dctx* ctx_ = nullptr; bool finished_; }; @@ -124,12 +123,6 @@ class LZ4Compressor : public Compressor { bool* should_retry) override; protected: - Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << LZ4F_getErrorName(ret); - return Status::IOError(ss.str()); - } - LZ4F_cctx* ctx_ = nullptr; LZ4F_preferences_t prefs_; bool first_time_; diff --git a/cpp/src/arrow/util/compression_snappy.cc b/cpp/src/arrow/util/compression_snappy.cc index 1b483e5855209..058593fe13d4e 100644 --- a/cpp/src/arrow/util/compression_snappy.cc +++ b/cpp/src/arrow/util/compression_snappy.cc @@ -57,10 +57,8 @@ Status SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, return Status::IOError("Corrupt snappy compressed data."); } if (output_buffer_len < static_cast(decompressed_size)) { - std::stringstream ss; - ss << "Output buffer size (" << output_buffer_len << ") must be " << decompressed_size - << " or larger."; - return Status::Invalid(ss.str()); + return Status::Invalid("Output buffer size (", output_buffer_len, ") must be ", + decompressed_size, " or larger."); } if (output_len) { *output_len = static_cast(decompressed_size); diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 
686dffa640940..736b0ab4f1524 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -76,12 +76,16 @@ static int DecompressionWindowBitsForFormat(GZipCodec::Format format) { } } +static Status ZlibErrorPrefix(const char* prefix_msg, const char* msg) { + return Status::IOError(prefix_msg, (msg) ? msg : "(unknown error)"); +} + // ---------------------------------------------------------------------- // gzip decompressor implementation class GZipDecompressor : public Decompressor { public: - GZipDecompressor() : initialized_(false) {} + GZipDecompressor() : initialized_(false), finished_(false) {} ~GZipDecompressor() override { if (initialized_) { @@ -142,14 +146,7 @@ class GZipDecompressor : public Decompressor { protected: Status ZlibError(const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg; - if (stream_.msg && *stream_.msg) { - ss << stream_.msg; - } else { - ss << "(unknown error)"; - } - return Status::IOError(ss.str()); + return ZlibErrorPrefix(prefix_msg, stream_.msg); } z_stream stream_; @@ -197,14 +194,7 @@ class GZipCompressor : public Compressor { protected: Status ZlibError(const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg; - if (stream_.msg && *stream_.msg) { - ss << stream_.msg; - } else { - ss << "(unknown error)"; - } - return Status::IOError(ss.str()); + return ZlibErrorPrefix(prefix_msg, stream_.msg); } z_stream stream_; @@ -344,9 +334,7 @@ class GZipCodec::GZipCodecImpl { int window_bits = CompressionWindowBitsForFormat(format_); if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, kGZipDefaultCompressionLevel, Z_DEFAULT_STRATEGY)) != Z_OK) { - std::stringstream ss; - ss << "zlib deflateInit failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib deflateInit failed: ", stream_.msg); } compressor_initialized_ = true; return Status::OK(); @@ -367,9 +355,7 @@ class GZipCodec::GZipCodecImpl { // Initialize to run either deflate or zlib/gzip format int window_bits = DecompressionWindowBitsForFormat(format_); if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { - std::stringstream ss; - ss << "zlib inflateInit failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib inflateInit failed: ", stream_.msg); } decompressor_initialized_ = true; return Status::OK(); @@ -401,9 +387,7 @@ class GZipCodec::GZipCodecImpl { // Reset the stream for this block if (inflateReset(&stream_) != Z_OK) { - std::stringstream ss; - ss << "zlib inflateReset failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib inflateReset failed: ", stream_.msg); } int ret = 0; @@ -425,18 +409,13 @@ class GZipCodec::GZipCodecImpl { if (ret == Z_STREAM_END || ret != Z_OK) break; // Failure, buffer was too small - std::stringstream ss; - ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length - << " OutputLength=" << output_buffer_length; - return Status::IOError(ss.str()); + return Status::IOError("Too small a buffer passed to GZipCodec. 
InputLength=", + input_length, " OutputLength=", output_buffer_length); } // Failure for some other reason if (ret != Z_STREAM_END) { - std::stringstream ss; - ss << "GZipCodec failed: "; - if (stream_.msg != NULL) ss << stream_.msg; - return Status::IOError(ss.str()); + return ZlibErrorPrefix("GZipCodec failed: ", stream_.msg); } if (output_length) { @@ -475,15 +454,12 @@ class GZipCodec::GZipCodecImpl { // small return Status::IOError("zlib deflate failed, output buffer too small"); } - std::stringstream ss; - ss << "zlib deflate failed: " << stream_.msg; - return Status::IOError(ss.str()); + + return ZlibErrorPrefix("zlib deflate failed: ", stream_.msg); } if (deflateReset(&stream_) != Z_OK) { - std::stringstream ss; - ss << "zlib deflateReset failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib deflateReset failed: ", stream_.msg); } // Actual output length diff --git a/cpp/src/arrow/util/compression_zstd.cc b/cpp/src/arrow/util/compression_zstd.cc index 083cae99b9730..de9df8fc9492e 100644 --- a/cpp/src/arrow/util/compression_zstd.cc +++ b/cpp/src/arrow/util/compression_zstd.cc @@ -36,9 +36,7 @@ namespace util { constexpr int kZSTDDefaultCompressionLevel = 1; static Status ZSTDError(size_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << ZSTD_getErrorName(ret); - return Status::IOError(ss.str()); + return Status::IOError(prefix_msg, ZSTD_getErrorName(ret)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 94c270280ea3c..73ac48cf88f20 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -417,8 +417,8 @@ TEST(Decimal128Test, TestFromBigEndian) { auto negated = -value; little_endian = negated.ToBytes(); std::reverse(little_endian.begin(), little_endian.end()); - // Convert all of the bytes since we have to include the sign bit - ASSERT_OK(Decimal128::FromBigEndian(little_endian.data(), 16, &out)); + // The sign bit is looked up in the MSB + ASSERT_OK(Decimal128::FromBigEndian(little_endian.data() + 15 - ii, ii + 1, &out)); ASSERT_EQ(negated, out); // Take the complement and convert to big endian @@ -466,4 +466,108 @@ TEST(Decimal128Test, TestToInteger) { ASSERT_RAISES(Invalid, invalid_int64.ToInteger(&out2)); } +TEST(Decimal128Test, GetWholeAndFraction) { + Decimal128 value("123456"); + Decimal128 whole; + Decimal128 fraction; + int32_t out; + + value.GetWholeAndFraction(0, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(123456, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(0, out); + + value.GetWholeAndFraction(1, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(12345, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(6, out); + + value.GetWholeAndFraction(5, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(1, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(23456, out); + + value.GetWholeAndFraction(7, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(0, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(123456, out); +} + +TEST(Decimal128Test, GetWholeAndFractionNegative) { + Decimal128 value("-123456"); + Decimal128 whole; + Decimal128 fraction; + int32_t out; + + value.GetWholeAndFraction(0, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(-123456, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(0, out); + + 
value.GetWholeAndFraction(1, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(-12345, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(-6, out); + + value.GetWholeAndFraction(5, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(-1, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(-23456, out); + + value.GetWholeAndFraction(7, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(0, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(-123456, out); +} + +TEST(Decimal128Test, IncreaseScale) { + Decimal128 result; + int32_t out; + + result = Decimal128("1234").IncreaseScaleBy(3); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(1234000, out); + + result = Decimal128("-1234").IncreaseScaleBy(3); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1234000, out); +} + +TEST(Decimal128Test, ReduceScaleAndRound) { + Decimal128 result; + int32_t out; + + result = Decimal128("123456").ReduceScaleBy(1, false); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(12345, out); + + result = Decimal128("123456").ReduceScaleBy(1, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(12346, out); + + result = Decimal128("123451").ReduceScaleBy(1, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(12345, out); + + result = Decimal128("-123789").ReduceScaleBy(2, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1238, out); + + result = Decimal128("-123749").ReduceScaleBy(2, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1237, out); + + result = Decimal128("-123750").ReduceScaleBy(2, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1238, out); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 9d22e005e7276..347a07dcf8ccc 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -29,80 +29,23 @@ #include "arrow/status.h" #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" namespace arrow { -static const Decimal128 ScaleMultipliers[] = { - Decimal128(0LL), - Decimal128(10LL), - Decimal128(100LL), - Decimal128(1000LL), - Decimal128(10000LL), - Decimal128(100000LL), - Decimal128(1000000LL), - Decimal128(10000000LL), - Decimal128(100000000LL), - Decimal128(1000000000LL), - Decimal128(10000000000LL), - Decimal128(100000000000LL), - Decimal128(1000000000000LL), - Decimal128(10000000000000LL), - Decimal128(100000000000000LL), - Decimal128(1000000000000000LL), - Decimal128(10000000000000000LL), - Decimal128(100000000000000000LL), - Decimal128(1000000000000000000LL), - Decimal128(0LL, 10000000000000000000ULL), - Decimal128(5LL, 7766279631452241920ULL), - Decimal128(54LL, 3875820019684212736ULL), - Decimal128(542LL, 1864712049423024128ULL), - Decimal128(5421LL, 200376420520689664ULL), - Decimal128(54210LL, 2003764205206896640ULL), - Decimal128(542101LL, 1590897978359414784ULL), - Decimal128(5421010LL, 15908979783594147840ULL), - Decimal128(54210108LL, 11515845246265065472ULL), - Decimal128(542101086LL, 4477988020393345024ULL), - Decimal128(5421010862LL, 7886392056514347008ULL), - Decimal128(54210108624LL, 5076944270305263616ULL), - Decimal128(542101086242LL, 13875954555633532928ULL), - Decimal128(5421010862427LL, 9632337040368467968ULL), - Decimal128(54210108624275LL, 4089650035136921600ULL), - Decimal128(542101086242752LL, 4003012203950112768ULL), - Decimal128(5421010862427522LL, 3136633892082024448ULL), - 
Decimal128(54210108624275221LL, 12919594847110692864ULL), - Decimal128(542101086242752217LL, 68739955140067328ULL), - Decimal128(5421010862427522170LL, 687399551400673280ULL)}; - -static constexpr uint64_t kIntMask = 0xFFFFFFFF; -static constexpr auto kCarryBit = static_cast(1) << static_cast(32); +using internal::SafeLeftShift; +using internal::SafeSignedAdd; Decimal128::Decimal128(const std::string& str) : Decimal128() { Status status(Decimal128::FromString(str, this)); DCHECK(status.ok()) << status.message(); } -Decimal128::Decimal128(const uint8_t* bytes) - : Decimal128(BitUtil::FromLittleEndian(reinterpret_cast(bytes)[1]), - BitUtil::FromLittleEndian(reinterpret_cast(bytes)[0])) { -} - -std::array Decimal128::ToBytes() const { - std::array out{{0}}; - ToBytes(out.data()); - return out; -} - -void Decimal128::ToBytes(uint8_t* out) const { - DCHECK_NE(out, nullptr); - reinterpret_cast(out)[0] = BitUtil::ToLittleEndian(low_bits_); - reinterpret_cast(out)[1] = BitUtil::ToLittleEndian(high_bits_); -} - -static constexpr Decimal128 kTenTo36(static_cast(0xC097CE7BC90715), - 0xB34B9F1000000000); -static constexpr Decimal128 kTenTo18(0xDE0B6B3A7640000); +static const Decimal128 kTenTo36(static_cast(0xC097CE7BC90715), + 0xB34B9F1000000000); +static const Decimal128 kTenTo18(0xDE0B6B3A7640000); std::string Decimal128::ToIntegerString() const { Decimal128 remainder; @@ -111,8 +54,7 @@ std::string Decimal128::ToIntegerString() const { // get anything above 10 ** 36 and print it Decimal128 top; - Status s = Divide(kTenTo36, &top, &remainder); - DCHECK(s.ok()) << s.message(); + DCHECK_OK(Divide(kTenTo36, &top, &remainder)); if (top != 0) { buf << static_cast(top); @@ -122,7 +64,7 @@ std::string Decimal128::ToIntegerString() const { // now get anything above 10 ** 18 and print it Decimal128 tail; - s = remainder.Divide(kTenTo18, &top, &tail); + auto s = remainder.Divide(kTenTo18, &top, &tail); if (need_fill || top != 0) { if (need_fill) { @@ -144,11 +86,11 @@ std::string Decimal128::ToIntegerString() const { } Decimal128::operator int64_t() const { - DCHECK(high_bits_ == 0 || high_bits_ == -1) + DCHECK(high_bits() == 0 || high_bits() == -1) << "Trying to cast an Decimal128 greater than the value range of a " "int64_t. 
high_bits_ must be equal to 0 or -1, got: " - << high_bits_; - return static_cast(low_bits_); + << high_bits(); + return static_cast(low_bits()); } static std::string ToStringNegativeScale(const std::string& str, @@ -337,17 +279,15 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) } // namespace -Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, - int32_t* scale) { +Status Decimal128::FromString(const util::string_view& s, Decimal128* out, + int32_t* precision, int32_t* scale) { if (s.empty()) { return Status::Invalid("Empty string cannot be converted to decimal"); } DecimalComponents dec; if (!ParseDecimalComponents(s.data(), s.size(), &dec)) { - std::stringstream ss; - ss << "The string '" << s << "' is not a valid decimal number"; - return Status::Invalid(ss.str()); + return Status::Invalid("The string '", s, "' is not a valid decimal number"); } std::string exponent_value = dec.exponent_sign + dec.exponent_digits; @@ -381,7 +321,7 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* pr if (scale != nullptr && *scale < 0) { const int32_t abs_scale = std::abs(*scale); - *out *= ScaleMultipliers[abs_scale]; + *out *= GetScaleMultiplier(abs_scale); if (precision != nullptr) { *precision += abs_scale; @@ -393,493 +333,18 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* pr return Status::OK(); } -Decimal128& Decimal128::Negate() { - low_bits_ = ~low_bits_ + 1; - high_bits_ = ~high_bits_; - if (low_bits_ == 0) { - ++high_bits_; - } - return *this; -} - -Decimal128& Decimal128::Abs() { return *this < 0 ? Negate() : *this; } - -Decimal128& Decimal128::operator+=(const Decimal128& right) { - const uint64_t sum = low_bits_ + right.low_bits_; - high_bits_ += right.high_bits_; - if (sum < low_bits_) { - ++high_bits_; - } - low_bits_ = sum; - return *this; -} - -Decimal128& Decimal128::operator-=(const Decimal128& right) { - const uint64_t diff = low_bits_ - right.low_bits_; - high_bits_ -= right.high_bits_; - if (diff > low_bits_) { - --high_bits_; - } - low_bits_ = diff; - return *this; -} - -Decimal128& Decimal128::operator/=(const Decimal128& right) { - Decimal128 remainder; - Status s = Divide(right, this, &remainder); - DCHECK(s.ok()); - return *this; -} - -Decimal128& Decimal128::operator|=(const Decimal128& right) { - low_bits_ |= right.low_bits_; - high_bits_ |= right.high_bits_; - return *this; -} - -Decimal128& Decimal128::operator&=(const Decimal128& right) { - low_bits_ &= right.low_bits_; - high_bits_ &= right.high_bits_; - return *this; -} - -Decimal128& Decimal128::operator<<=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - high_bits_ <<= bits; - high_bits_ |= (low_bits_ >> (64 - bits)); - low_bits_ <<= bits; - } else if (bits < 128) { - high_bits_ = static_cast(low_bits_) << (bits - 64); - low_bits_ = 0; - } else { - high_bits_ = 0; - low_bits_ = 0; - } - } - return *this; -} - -Decimal128& Decimal128::operator>>=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - low_bits_ >>= bits; - low_bits_ |= static_cast(high_bits_ << (64 - bits)); - high_bits_ = static_cast(static_cast(high_bits_) >> bits); - } else if (bits < 128) { - low_bits_ = static_cast(high_bits_ >> (bits - 64)); - high_bits_ = static_cast(high_bits_ >= 0L ? 0L : -1L); - } else { - high_bits_ = static_cast(high_bits_ >= 0L ? 
0L : -1L); - low_bits_ = static_cast(high_bits_); - } - } - return *this; -} - -Decimal128& Decimal128::operator*=(const Decimal128& right) { - // Break the left and right numbers into 32 bit chunks - // so that we can multiply them without overflow. - const uint64_t L0 = static_cast(high_bits_) >> 32; - const uint64_t L1 = static_cast(high_bits_) & kIntMask; - const uint64_t L2 = low_bits_ >> 32; - const uint64_t L3 = low_bits_ & kIntMask; - - const uint64_t R0 = static_cast(right.high_bits_) >> 32; - const uint64_t R1 = static_cast(right.high_bits_) & kIntMask; - const uint64_t R2 = right.low_bits_ >> 32; - const uint64_t R3 = right.low_bits_ & kIntMask; - - uint64_t product = L3 * R3; - low_bits_ = product & kIntMask; - - uint64_t sum = product >> 32; - - product = L2 * R3; - sum += product; - - product = L3 * R2; - sum += product; - - low_bits_ += sum << 32; - - high_bits_ = static_cast(sum < product ? kCarryBit : 0); - if (sum < product) { - high_bits_ += kCarryBit; - } - - high_bits_ += static_cast(sum >> 32); - high_bits_ += L1 * R3 + L2 * R2 + L3 * R1; - high_bits_ += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; - return *this; -} - -/// Expands the given value into an array of ints so that we can work on -/// it. The array will be converted to an absolute value and the wasNegative -/// flag will be set appropriately. The array will remove leading zeros from -/// the value. -/// \param array an array of length 4 to set with the value -/// \param was_negative a flag for whether the value was original negative -/// \result the output length of the array -static int64_t FillInArray(const Decimal128& value, uint32_t* array, bool& was_negative) { - uint64_t high; - uint64_t low; - const int64_t highbits = value.high_bits(); - const uint64_t lowbits = value.low_bits(); - - if (highbits < 0) { - low = ~lowbits + 1; - high = static_cast(~highbits); - if (low == 0) { - ++high; - } - was_negative = true; - } else { - low = lowbits; - high = static_cast(highbits); - was_negative = false; - } - - if (high != 0) { - if (high > std::numeric_limits::max()) { - array[0] = static_cast(high >> 32); - array[1] = static_cast(high); - array[2] = static_cast(low >> 32); - array[3] = static_cast(low); - return 4; - } - - array[0] = static_cast(high); - array[1] = static_cast(low >> 32); - array[2] = static_cast(low); - return 3; - } - - if (low >= std::numeric_limits::max()) { - array[0] = static_cast(low >> 32); - array[1] = static_cast(low); - return 2; - } - - if (low == 0) { - return 0; - } - - array[0] = static_cast(low); - return 1; -} - -/// Shift the number in the array left by bits positions. -/// \param array the number to shift, must have length elements -/// \param length the number of entries in the array -/// \param bits the number of bits to shift (0 <= bits < 32) -static void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits != 0) { - for (int64_t i = 0; i < length - 1; ++i) { - array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits)); - } - array[length - 1] <<= bits; - } -} - -/// Shift the number in the array right by bits positions. 
-/// \param array the number to shift, must have length elements -/// \param length the number of entries in the array -/// \param bits the number of bits to shift (0 <= bits < 32) -static void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits != 0) { - for (int64_t i = length - 1; i > 0; --i) { - array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits)); - } - array[0] >>= bits; - } -} - -/// \brief Fix the signs of the result and remainder at the end of the division based on -/// the signs of the dividend and divisor. -static void FixDivisionSigns(Decimal128* result, Decimal128* remainder, - bool dividend_was_negative, bool divisor_was_negative) { - if (dividend_was_negative != divisor_was_negative) { - result->Negate(); - } - - if (dividend_was_negative) { - remainder->Negate(); - } -} - -/// \brief Build a Decimal128 from a list of ints. -static Status BuildFromArray(Decimal128* value, uint32_t* array, int64_t length) { - switch (length) { - case 0: - *value = {static_cast(0)}; - break; - case 1: - *value = {static_cast(array[0])}; - break; - case 2: - *value = {static_cast(0), - (static_cast(array[0]) << 32) + array[1]}; - break; - case 3: - *value = {static_cast(array[0]), - (static_cast(array[1]) << 32) + array[2]}; - break; - case 4: - *value = {(static_cast(array[0]) << 32) + array[1], - (static_cast(array[2]) << 32) + array[3]}; - break; - case 5: - if (array[0] != 0) { - return Status::Invalid("Can't build Decimal128 with 5 ints."); - } - *value = {(static_cast(array[1]) << 32) + array[2], - (static_cast(array[3]) << 32) + array[4]}; - break; - default: - return Status::Invalid("Unsupported length for building Decimal128"); - } - - return Status::OK(); -} - -/// \brief Do a division where the divisor fits into a single 32 bit value. -static Status SingleDivide(const uint32_t* dividend, int64_t dividend_length, - uint32_t divisor, Decimal128* remainder, - bool dividend_was_negative, bool divisor_was_negative, - Decimal128* result) { - uint64_t r = 0; - uint32_t result_array[5]; - for (int64_t j = 0; j < dividend_length; j++) { - r <<= 32; - r += dividend[j]; - result_array[j] = static_cast(r / divisor); - r %= divisor; - } - RETURN_NOT_OK(BuildFromArray(result, result_array, dividend_length)); - *remainder = static_cast(r); - FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative); - return Status::OK(); -} - -Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, - Decimal128* remainder) const { - // Split the dividend and divisor into integer pieces so that we can - // work on them. - uint32_t dividend_array[5]; - uint32_t divisor_array[4]; - bool dividend_was_negative; - bool divisor_was_negative; - // leave an extra zero before the dividend - dividend_array[0] = 0; - int64_t dividend_length = - FillInArray(*this, dividend_array + 1, dividend_was_negative) + 1; - int64_t divisor_length = FillInArray(divisor, divisor_array, divisor_was_negative); - - // Handle some of the easy cases. 
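// A sketch, not part of this patch: the block being removed here is a
// textbook multi-precision division on 32-bit limbs (TAOCP 4.3.1, Algorithm
// D), which now lives in BasicDecimal128. Its single-divisor fast path,
// SingleDivide above, reduces to schoolbook long division:

#include <cstdint>
#include <vector>

// Divide a big-endian array of 32-bit limbs in place by a 32-bit divisor,
// returning the remainder. Hypothetical helper mirroring SingleDivide.
inline uint32_t DivideByLimb(std::vector<uint32_t>* limbs, uint32_t divisor) {
  uint64_t r = 0;
  for (uint32_t& limb : *limbs) {
    r = (r << 32) | limb;                       // bring down the next "digit"
    limb = static_cast<uint32_t>(r / divisor);  // emit a quotient digit
    r %= divisor;                               // carry the remainder along
  }
  return static_cast<uint32_t>(r);
}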
- if (dividend_length <= divisor_length) { - *remainder = *this; - *result = 0; - return Status::OK(); - } - - if (divisor_length == 0) { - return Status::Invalid("Division by 0 in Decimal128"); - } - - if (divisor_length == 1) { - return SingleDivide(dividend_array, dividend_length, divisor_array[0], remainder, - dividend_was_negative, divisor_was_negative, result); - } - - int64_t result_length = dividend_length - divisor_length; - uint32_t result_array[4]; - - // Normalize by shifting both by a multiple of 2 so that - // the digit guessing is better. The requirement is that - // divisor_array[0] is greater than 2**31. - int64_t normalize_bits = BitUtil::CountLeadingZeros(divisor_array[0]); - ShiftArrayLeft(divisor_array, divisor_length, normalize_bits); - ShiftArrayLeft(dividend_array, dividend_length, normalize_bits); - - // compute each digit in the result - for (int64_t j = 0; j < result_length; ++j) { - // Guess the next digit. At worst it is two too large - uint32_t guess = std::numeric_limits::max(); - const auto high_dividend = - static_cast(dividend_array[j]) << 32 | dividend_array[j + 1]; - if (dividend_array[j] != divisor_array[0]) { - guess = static_cast(high_dividend / divisor_array[0]); - } - - // catch all of the cases where guess is two too large and most of the - // cases where it is one too large - auto rhat = static_cast(high_dividend - - guess * static_cast(divisor_array[0])); - while (static_cast(divisor_array[1]) * guess > - (static_cast(rhat) << 32) + dividend_array[j + 2]) { - --guess; - rhat += divisor_array[0]; - if (static_cast(rhat) < divisor_array[0]) { - break; - } - } - - // subtract off the guess * divisor from the dividend - uint64_t mult = 0; - for (int64_t i = divisor_length - 1; i >= 0; --i) { - mult += static_cast(guess) * divisor_array[i]; - uint32_t prev = dividend_array[j + i + 1]; - dividend_array[j + i + 1] -= static_cast(mult); - mult >>= 32; - if (dividend_array[j + i + 1] > prev) { - ++mult; - } - } - uint32_t prev = dividend_array[j]; - dividend_array[j] -= static_cast(mult); - - // if guess was too big, we add back divisor - if (dividend_array[j] > prev) { - --guess; - uint32_t carry = 0; - for (int64_t i = divisor_length - 1; i >= 0; --i) { - const auto sum = - static_cast(divisor_array[i]) + dividend_array[j + i + 1] + carry; - dividend_array[j + i + 1] = static_cast(sum); - carry = static_cast(sum >> 32); - } - dividend_array[j] += carry; - } - - result_array[j] = guess; - } - - // denormalize the remainder - ShiftArrayRight(dividend_array, dividend_length, normalize_bits); - - // return result and remainder - RETURN_NOT_OK(BuildFromArray(result, result_array, result_length)); - RETURN_NOT_OK(BuildFromArray(remainder, dividend_array, dividend_length)); - - FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative); - return Status::OK(); -} - -bool operator==(const Decimal128& left, const Decimal128& right) { - return left.high_bits() == right.high_bits() && left.low_bits() == right.low_bits(); -} - -bool operator!=(const Decimal128& left, const Decimal128& right) { - return !operator==(left, right); -} - -bool operator<(const Decimal128& left, const Decimal128& right) { - return left.high_bits() < right.high_bits() || - (left.high_bits() == right.high_bits() && left.low_bits() < right.low_bits()); -} - -bool operator<=(const Decimal128& left, const Decimal128& right) { - return !operator>(left, right); -} - -bool operator>(const Decimal128& left, const Decimal128& right) { - return operator<(right, left); -} - -bool 
operator>=(const Decimal128& left, const Decimal128& right) { - return !operator<(left, right); -} - -Decimal128 operator-(const Decimal128& operand) { - Decimal128 result(operand.high_bits(), operand.low_bits()); - return result.Negate(); -} - -Decimal128 operator~(const Decimal128& operand) { - Decimal128 result(~operand.high_bits(), ~operand.low_bits()); - return result; -} - -Decimal128 operator+(const Decimal128& left, const Decimal128& right) { - Decimal128 result(left.high_bits(), left.low_bits()); - result += right; - return result; -} - -Decimal128 operator-(const Decimal128& left, const Decimal128& right) { - Decimal128 result(left.high_bits(), left.low_bits()); - result -= right; - return result; -} - -Decimal128 operator*(const Decimal128& left, const Decimal128& right) { - Decimal128 result(left.high_bits(), left.low_bits()); - result *= right; - return result; -} - -Decimal128 operator/(const Decimal128& left, const Decimal128& right) { - Decimal128 remainder; - Decimal128 result; - Status s = left.Divide(right, &result, &remainder); - DCHECK(s.ok()); - return result; -} - -Decimal128 operator%(const Decimal128& left, const Decimal128& right) { - Decimal128 remainder; - Decimal128 result; - Status s = left.Divide(right, &result, &remainder); - DCHECK(s.ok()); - return remainder; -} - -static bool RescaleWouldCauseDataLoss(const Decimal128& value, int32_t delta_scale, - int32_t abs_delta_scale, Decimal128* result) { - Decimal128 multiplier(ScaleMultipliers[abs_delta_scale]); - - if (delta_scale < 0) { - DCHECK_NE(multiplier, 0); - Decimal128 remainder; - Status status = value.Divide(multiplier, result, &remainder); - DCHECK(status.ok()) << status.message(); - return remainder != 0; - } - - *result = value * multiplier; - return (value < 0) ? 
*result > value : *result < value; +Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, + int32_t* scale) { + return FromString(util::string_view(s), out, precision, scale); } -Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, - Decimal128* out) const { - DCHECK_NE(out, nullptr) << "out is nullptr"; - DCHECK_NE(original_scale, new_scale) << "original_scale != new_scale"; - - const int32_t delta_scale = new_scale - original_scale; - const int32_t abs_delta_scale = std::abs(delta_scale); - - DCHECK_GE(abs_delta_scale, 1); - DCHECK_LE(abs_delta_scale, 38); - - Decimal128 result(*this); - const bool rescale_would_cause_data_loss = - RescaleWouldCauseDataLoss(result, delta_scale, abs_delta_scale, out); - - // Fail if we overflow or truncate - if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) { - std::stringstream buf; - buf << "Rescaling decimal value " << ToString(original_scale) - << " from original scale of " << original_scale << " to new scale of " - << new_scale << " would cause data loss"; - return Status::Invalid(buf.str()); - } - - return Status::OK(); +Status Decimal128::FromString(const char* s, Decimal128* out, int32_t* precision, + int32_t* scale) { + return FromString(util::string_view(s), out, precision, scale); } // Helper function used by Decimal128::FromBigEndian -static inline uint64_t FromBigEndian(const uint8_t* bytes, int32_t length) { +static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { // We don't bounds check the length here because this is called by // FromBigEndian that has a Decimal128 as its out parameters and // that function is already checking the length of the bytes and only @@ -896,47 +361,77 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 static constexpr int32_t kMinDecimalBytes = 1; static constexpr int32_t kMaxDecimalBytes = 16; - int64_t high; - uint64_t low; + int64_t high, low; if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { - std::ostringstream stream; - stream << "Length of byte array passed to Decimal128::FromBigEndian "; - stream << "was " << length << ", but must be between "; - stream << kMinDecimalBytes << " and " << kMaxDecimalBytes; - return Status::Invalid(stream.str()); + return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ", + "was ", length, ", but must be between ", kMinDecimalBytes, + " and ", kMaxDecimalBytes); } - /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the - /// sign bit. + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. const bool is_negative = static_cast(bytes[0]) < 0; - /// Sign extend the low bits if necessary - low = UINT64_MAX * (is_negative && length < 8); - high = -1 * (is_negative && length < kMaxDecimalBytes); - - /// Stop byte of the high bytes + // 1. 
Extract the high bytes + // Stop byte of the high bytes const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = UInt64FromBigEndian(bytes, high_bits_offset); - /// Shift left enough bits to make room for the incoming int64_t - high <<= high_bits_offset * CHAR_BIT; - - /// Preserve the upper bits by inplace OR-ing the int64_t - uint64_t value = arrow::FromBigEndian(bytes, high_bits_offset); - high |= value; + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } - /// Stop byte of the low bytes + // 2. Extract the low bytes + // Stop byte of the low bytes const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = + UInt64FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - /// Shift left enough bits to make room for the incoming uint64_t - low <<= low_bits_offset * CHAR_BIT; - - /// Preserve the upper bits by inplace OR-ing the uint64_t - value = arrow::FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - low |= value; + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } - *out = Decimal128(high, low); + *out = Decimal128(high, static_cast(low)); return Status::OK(); } +Status Decimal128::ToArrowStatus(DecimalStatus dstatus) const { + Status status; + + switch (dstatus) { + case DecimalStatus::kSuccess: + status = Status::OK(); + break; + + case DecimalStatus::kDivideByZero: + status = Status::Invalid("Division by 0 in Decimal128"); + break; + + case DecimalStatus::kOverflow: + status = Status::Invalid("Overflow occurred during Decimal128 operation."); + break; + + case DecimalStatus::kRescaleDataLoss: + status = Status::Invalid("Rescaling decimal value would cause data loss"); + break; + } + return status; +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 26b82a42f70a7..4c61a1736d04e 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_DECIMAL_H -#define ARROW_DECIMAL_H +#pragma once #include #include @@ -26,9 +25,8 @@ #include #include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/type_traits.h" -#include "arrow/util/visibility.h" +#include "arrow/util/basic_decimal.h" +#include "arrow/util/string_view.h" namespace arrow { @@ -39,80 +37,47 @@ namespace arrow { /// Semi-numerical Algorithms section 4.3.1. /// /// Adapted from the Apache ORC C++ implementation -class ARROW_EXPORT Decimal128 { +/// +/// The implementation is split into two parts : +/// +/// 1. BasicDecimal128 +/// - can be safely compiled to IR without references to libstdc++. +/// 2. Decimal128 +/// - has additional functionality on top of BasicDecimal128 to deal with +/// strings and streams. 
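// A sketch, not part of this patch: the FromBigEndian rework above avoids
// two pitfalls, shifting by the full word width (undefined) and
// left-shifting negative signed values (undefined before C++20), hence the
// SafeLeftShift detour through unsigned arithmetic. The underlying idea,
// widening an N-byte big-endian two's-complement integer, fits in a few
// lines (hypothetical helper):

#include <cstdint>

inline int64_t Int64FromBigEndianBytes(const uint8_t* bytes, int n /* 1..8 */) {
  // Seed with all-one bits when the most significant byte has its sign bit
  // set, so the bytes we never read end up sign-extended.
  uint64_t acc = (bytes[0] & 0x80) ? ~uint64_t{0} : 0;
  for (int i = 0; i < n; ++i) {
    acc = (acc << 8) | bytes[i];  // unsigned shift: no UB, wraps as intended
  }
  return static_cast<int64_t>(acc);
}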
+class ARROW_EXPORT Decimal128 : public BasicDecimal128 { public: - /// \brief Create an Decimal128 from the two's complement representation. - constexpr Decimal128(int64_t high, uint64_t low) noexcept - : low_bits_(low), high_bits_(high) {} - - /// \brief Empty constructor creates an Decimal128 with a value of 0. - constexpr Decimal128() noexcept : Decimal128(0, 0) {} + /// \cond FALSE + // (need to avoid a duplicate definition in Sphinx) + using BasicDecimal128::BasicDecimal128; + /// \endcond - /// \brief Convert any integer value into an Decimal128. - template ::value, T>::type> - constexpr Decimal128(T value) noexcept - : Decimal128(static_cast(value) >= 0 ? 0 : -1, - static_cast(value)) {} + /// \brief constructor creates a Decimal128 from a BasicDecimal128. + constexpr Decimal128(const BasicDecimal128& value) noexcept : BasicDecimal128(value) {} /// \brief Parse the number from a base 10 string representation. explicit Decimal128(const std::string& value); - /// \brief Create an Decimal128 from an array of bytes. Bytes are assumed to be in - /// little endian byte order. - explicit Decimal128(const uint8_t* bytes); - - /// \brief Negate the current value - Decimal128& Negate(); - - /// \brief Absolute value - Decimal128& Abs(); - - /// \brief Add a number to this one. The result is truncated to 128 bits. - Decimal128& operator+=(const Decimal128& right); + /// \brief Empty constructor creates a Decimal128 with a value of 0. + // This is required on some older compilers. + constexpr Decimal128() noexcept : BasicDecimal128() {} - /// \brief Subtract a number from this one. The result is truncated to 128 bits. - Decimal128& operator-=(const Decimal128& right); - - /// \brief Multiply this number by another number. The result is truncated to 128 bits. - Decimal128& operator*=(const Decimal128& right); - - /// Divide this number by right and return the result. This operation is - /// not destructive. + /// Divide this number by right and return the result. + /// + /// This operation is not destructive. /// The answer rounds to zero. Signs work like: /// 21 / 5 -> 4, 1 /// -21 / 5 -> -4, -1 /// 21 / -5 -> -4, 1 /// -21 / -5 -> 4, -1 - /// \param divisor the number to divide by - /// \param remainder the remainder after the division + /// \param[in] divisor the number to divide by + /// \param[out] result the quotient + /// \param[out] remainder the remainder after the division Status Divide(const Decimal128& divisor, Decimal128* result, - Decimal128* remainder) const; - - /// \brief In-place division. - Decimal128& operator/=(const Decimal128& right); - - /// \brief Bitwise or between two Decimal128. - Decimal128& operator|=(const Decimal128& right); - - /// \brief Bitwise and between two Decimal128. - Decimal128& operator&=(const Decimal128& right); - - /// \brief Shift left by the given number of bits. - Decimal128& operator<<=(uint32_t bits); - - /// \brief Shift right by the given number of bits. Negative values will - Decimal128& operator>>=(uint32_t bits); - - /// \brief Get the high bits of the two's complement representation of the number. - inline int64_t high_bits() const { return high_bits_; } - - /// \brief Get the low bits of the two's complement representation of the number. - inline uint64_t low_bits() const { return low_bits_; } - - /// \brief Return the raw bytes of the value in little-endian byte order. 
- std::array ToBytes() const; - void ToBytes(uint8_t* out) const; + Decimal128* remainder) const { + auto dstatus = BasicDecimal128::Divide(divisor, result, remainder); + return ToArrowStatus(dstatus); + } /// \brief Convert the Decimal128 value to a base 10 decimal string with the given /// scale. @@ -124,18 +89,25 @@ class ARROW_EXPORT Decimal128 { /// \brief Cast this value to an int64_t. explicit operator int64_t() const; - /// \brief Convert a decimal string to an Decimal128 value, optionally including + /// \brief Convert a decimal string to a Decimal128 value, optionally including /// precision and scale if they're passed in and not null. + static Status FromString(const util::string_view& s, Decimal128* out, + int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); static Status FromString(const std::string& s, Decimal128* out, int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const char* s, Decimal128* out, int32_t* precision = NULLPTR, + int32_t* scale = NULLPTR); - /// \brief Convert from a big endian byte representation. The length must be - /// between 1 and 16 + /// \brief Convert from a big-endian byte representation. The length must be + /// between 1 and 16. /// \return error status if the length is an invalid value static Status FromBigEndian(const uint8_t* data, int32_t length, Decimal128* out); /// \brief Convert Decimal128 from one scale to another - Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const; + Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const { + auto dstatus = BasicDecimal128::Rescale(original_scale, new_scale, out); + return ToArrowStatus(dstatus); + } /// \brief Convert to a signed integer template > @@ -144,34 +116,16 @@ class ARROW_EXPORT Decimal128 { constexpr auto max_value = std::numeric_limits::max(); const auto& self = *this; if (self < min_value || self > max_value) { - std::stringstream buf; - buf << "Invalid cast from Decimal128 to " << sizeof(T) << " byte integer"; - return Status::Invalid(buf.str()); + return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T), + " byte integer"); } - *out = static_cast(low_bits_); + *out = static_cast(low_bits()); return Status::OK(); } private: - uint64_t low_bits_; - int64_t high_bits_; + /// Converts internal error code to Status + Status ToArrowStatus(DecimalStatus dstatus) const; }; -ARROW_EXPORT bool operator==(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT bool operator!=(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT bool operator<(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT bool operator<=(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT bool operator>(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT bool operator>=(const Decimal128& left, const Decimal128& right); - -ARROW_EXPORT Decimal128 operator-(const Decimal128& operand); -ARROW_EXPORT Decimal128 operator~(const Decimal128& operand); -ARROW_EXPORT Decimal128 operator+(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT Decimal128 operator-(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT Decimal128 operator*(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT Decimal128 operator/(const Decimal128& left, const Decimal128& right); -ARROW_EXPORT Decimal128 operator%(const Decimal128& left, const Decimal128& right); - } // namespace arrow - -#endif // ARROW_DECIMAL_H diff --git a/cpp/src/arrow/util/hash-util.h 
b/cpp/src/arrow/util/hash-util.h index fd69cb9438c12..509b7e64035db 100644 --- a/cpp/src/arrow/util/hash-util.h +++ b/cpp/src/arrow/util/hash-util.h @@ -134,10 +134,13 @@ class HashUtil { switch (nbytes) { case 3: h1 = HW_crc32_u8(h1, p[3]); + // fallthrough case 2: h2 = HW_crc32_u8(h2, p[2]); + // fallthrough case 1: h1 = HW_crc32_u8(h1, p[1]); + // fallthrough case 0: break; default: diff --git a/cpp/src/arrow/util/hashing-benchmark.cc b/cpp/src/arrow/util/hashing-benchmark.cc index 7d91f0f536ac1..ee70391815084 100644 --- a/cpp/src/arrow/util/hashing-benchmark.cc +++ b/cpp/src/arrow/util/hashing-benchmark.cc @@ -49,13 +49,13 @@ static std::vector MakeStrings(int32_t n_values, int32_t min_length // Generate strings between 2 and 20 bytes std::uniform_int_distribution length_dist(min_length, max_length); - std::independent_bits_engine bytes_gen(42); + std::independent_bits_engine bytes_gen(42); std::generate(values.begin(), values.end(), [&]() { auto length = length_dist(gen); std::string s(length, 'X'); for (int32_t i = 0; i < length; ++i) { - s[i] = bytes_gen(); + s[i] = static_cast(bytes_gen()); } return s; }); @@ -74,6 +74,7 @@ static void BM_HashIntegers(benchmark::State& state) { // NOLINT non-const refe benchmark::DoNotOptimize(total); } state.SetBytesProcessed(2 * state.iterations() * values.size() * sizeof(int64_t)); + state.SetItemsProcessed(2 * state.iterations() * values.size()); } static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const reference @@ -92,6 +93,7 @@ static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const benchmark::DoNotOptimize(total); } state.SetBytesProcessed(2 * state.iterations() * total_size); + state.SetItemsProcessed(2 * state.iterations() * values.size()); } static void BM_HashSmallStrings(benchmark::State& state) { // NOLINT non-const reference diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index ee368fb4e314c..3dde0beeb194e 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -102,6 +102,18 @@ struct ScalarHelper +struct ScalarHelper< + Scalar, AlgNum, + typename std::enable_if::value>::type> + : public ScalarHelperBase { + // ScalarHelper specialization for util::string_view + + static hash_t ComputeHash(const util::string_view& value) { + return ComputeStringHash(value.data(), static_cast(value.size())); + } +}; + template struct ScalarHelper::value>::type> @@ -332,7 +344,7 @@ class ScalarMemoTable { explicit ScalarMemoTable(int64_t entries = 0) : hash_table_(static_cast(entries)) {} - int32_t Get(const Scalar value) const { + int32_t Get(const Scalar& value) const { auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(payload->value, value); }; @@ -346,7 +358,7 @@ class ScalarMemoTable { } template - int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) { + int32_t GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found) { auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(value, payload->value); }; @@ -364,7 +376,7 @@ class ScalarMemoTable { return memo_index; } - int32_t GetOrInsert(const Scalar value) { + int32_t GetOrInsert(const Scalar& value) { return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); } @@ -389,6 +401,7 @@ class ScalarMemoTable { Scalar value; int32_t memo_index; }; + using HashTableType = HashTableTemplateType; using HashTableEntry = typename HashTableType::Entry; HashTableType hash_table_; @@ 
-621,9 +634,11 @@ class BinaryMemoTable { struct Payload { int32_t memo_index; }; + using HashTableType = HashTable; using HashTableEntry = typename HashTable::Entry; HashTableType hash_table_; + std::vector offsets_; std::string values_; @@ -651,25 +666,6 @@ template struct HashTraits> { using c_type = typename T::c_type; using MemoTableType = SmallScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. - // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template @@ -677,25 +673,6 @@ struct HashTraits< T, typename std::enable_if::value && !is_8bit_int::value>::type> { using c_type = typename T::c_type; using MemoTableType = ScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. 
- // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 51fd96e4ea25a..5eba531d874e0 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -17,14 +17,12 @@ #include #include -#include #include #include #include #include -#include "arrow/test-util.h" #include "arrow/util/int-util.h" namespace arrow { @@ -375,5 +373,14 @@ TEST(IntWidth, NullsMany) { } } +TEST(TransposeInts, Int8ToInt64) { + std::vector src = {1, 3, 5, 0, 3, 2}; + std::vector transpose_map = {1111, 2222, 3333, 4444, 5555, 6666, 7777}; + std::vector dest(src.size()); + + TransposeInts(src.data(), dest.data(), 6, transpose_map.data()); + ASSERT_EQ(dest, std::vector({2222, 4444, 6666, 1111, 4444, 3333})); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.cc b/cpp/src/arrow/util/int-util.cc index ced1cd1c20da2..d81044b3cafdc 100644 --- a/cpp/src/arrow/util/int-util.cc +++ b/cpp/src/arrow/util/int-util.cc @@ -402,5 +402,45 @@ void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) { memcpy(dest, source, length * sizeof(int64_t)); } +template +void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length, + const int32_t* transpose_map) { + while (length >= 4) { + dest[0] = static_cast(transpose_map[src[0]]); + dest[1] = static_cast(transpose_map[src[1]]); + dest[2] = static_cast(transpose_map[src[2]]); + dest[3] = static_cast(transpose_map[src[3]]); + length -= 4; + src += 4; + dest += 4; + } + while (length > 0) { + *dest++ = static_cast(transpose_map[*src++]); + --length; + } +} + +#define INSTANTIATE(SRC, DEST) \ + template ARROW_EXPORT void TransposeInts( \ + const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map); + +#define INSTANTIATE_ALL_DEST(DEST) \ + INSTANTIATE(int8_t, DEST) \ + INSTANTIATE(int16_t, DEST) \ + INSTANTIATE(int32_t, DEST) \ + INSTANTIATE(int64_t, DEST) + +#define INSTANTIATE_ALL() \ + INSTANTIATE_ALL_DEST(int8_t) \ + INSTANTIATE_ALL_DEST(int16_t) \ + INSTANTIATE_ALL_DEST(int32_t) \ + INSTANTIATE_ALL_DEST(int64_t) + +INSTANTIATE_ALL() + +#undef INSTANTIATE +#undef INSTANTIATE_ALL +#undef INSTANTIATE_ALL_DEST + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h index 68355d34549ac..d3ae09f75cfa6 100644 --- a/cpp/src/arrow/util/int-util.h +++ b/cpp/src/arrow/util/int-util.h @@ -19,6 +19,7 @@ #define ARROW_UTIL_INT_UTIL_H #include +#include #include "arrow/util/visibility.h" @@ -63,6 +64,25 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length); ARROW_EXPORT void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length); +template +ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, + const int32_t* transpose_map); + +/// Signed addition with well-defined behaviour on overflow (as unsigned) +template +SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) + + static_cast(v)); +} + +/// Signed 
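// A sketch, not part of this patch: SafeSignedAdd above relies on unsigned
// arithmetic wrapping modulo 2^N, whereas signed overflow is undefined
// behaviour. Converting back reinterprets the wrapped bits
// (implementation-defined before C++20, two's complement on all platforms
// Arrow targets). Concretely:

#include <cstdint>
#include <limits>

inline int64_t WrappingAdd(int64_t a, int64_t b) {
  return static_cast<int64_t>(static_cast<uint64_t>(a) +
                              static_cast<uint64_t>(b));
}
// WrappingAdd(std::numeric_limits<int64_t>::max(), 1) wraps around to
// std::numeric_limits<int64_t>::min() instead of invoking UB.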
left shift with well-defined behaviour on negative numbers or overflow +template +SignedInt SafeLeftShift(SignedInt u, Shift shift) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) << shift); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc index 74ad80691da94..5d67fe87fa0e5 100644 --- a/cpp/src/arrow/util/io-util.cc +++ b/cpp/src/arrow/util/io-util.cc @@ -113,10 +113,8 @@ static inline Status CheckFileOpResult(int ret, int errno_actual, const PlatformFilename& file_name, const char* opname) { if (ret == -1) { - std::stringstream ss; - ss << "Failed to " << opname << " file: " << file_name.string(); - ss << " , error: " << std::strerror(errno_actual); - return Status::IOError(ss.str()); + return Status::IOError("Failed to ", opname, " file: ", file_name.string(), + " , error: ", std::strerror(errno_actual)); } return Status::OK(); } @@ -232,12 +230,18 @@ Status CreatePipe(int fd[2]) { #endif if (ret == -1) { - return Status::IOError(std::string("Error creating pipe: ") + - std::string(strerror(errno))); + return Status::IOError("Error creating pipe: ", std::strerror(errno)); } return Status::OK(); } +static Status StatusFromErrno(const char* prefix) { +#ifdef _WIN32 + errno = __map_mman_error(GetLastError(), EPERM); +#endif + return Status::IOError(prefix, std::strerror(errno)); +} + // // Compatible way to remap a memory map // @@ -251,18 +255,12 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, HANDLE fm, h; if (!UnmapViewOfFile(addr)) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "UnmapViewOfFile failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("UnmapViewOfFile failed: "); } h = reinterpret_cast(_get_osfhandle(fildes)); if (h == INVALID_HANDLE_VALUE) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "cannot get file handle: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("Cannot get file handle: "); } LONG new_size_low = static_cast(new_size & 0xFFFFFFFFL); @@ -272,18 +270,12 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, SetEndOfFile(h); fm = CreateFileMapping(h, NULL, PAGE_READWRITE, 0, 0, ""); if (fm == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "mremap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("CreateFileMapping failed: "); } *new_addr = MapViewOfFile(fm, FILE_MAP_WRITE, 0, 0, new_size); CloseHandle(fm); if (new_addr == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "mremap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("MapViewOfFile failed: "); } return Status::OK(); #else @@ -291,26 +283,26 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, // we have to close the mmap first, truncate the file to the new size // and recreate the mmap if (munmap(addr, old_size) == -1) { - std::stringstream ss; - ss << "munmap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("munmap failed: "); } if (ftruncate(fildes, new_size) == -1) { - std::stringstream ss; - ss << "cannot truncate file: " << std::strerror(errno); - return Status::IOError(ss.str()); + return 
StatusFromErrno("ftruncate failed: "); } // we set READ / WRITE flags on the new map, since we could only have // unlarged a RW map in the first place *new_addr = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, fildes, 0); + if (*new_addr == MAP_FAILED) { + return StatusFromErrno("mmap failed: "); + } return Status::OK(); #else if (ftruncate(fildes, new_size) == -1) { - std::stringstream ss; - ss << "file truncate failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("ftruncate failed: "); } *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE); + if (*new_addr == MAP_FAILED) { + return StatusFromErrno("mremap failed: "); + } return Status::OK(); #endif #endif diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 4cce700db970b..5ea78206a73ee 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -18,10 +18,29 @@ #ifndef ARROW_UTIL_LOGGING_H #define ARROW_UTIL_LOGGING_H +#ifdef GANDIVA_IR + +// The LLVM IR code doesn't have an NDEBUG mode. And, it shouldn't include references to +// streams or stdc++. So, making the DCHECK calls void in that case. + +#define ARROW_IGNORE_EXPR(expr) ((void)(expr)) + +#define DCHECK(condition) ARROW_IGNORE_EXPR(condition) +#define DCHECK_OK(status) ARROW_IGNORE_EXPR(status) +#define DCHECK_EQ(val1, val2) ARROW_IGNORE_EXPR(val1) +#define DCHECK_NE(val1, val2) ARROW_IGNORE_EXPR(val1) +#define DCHECK_LE(val1, val2) ARROW_IGNORE_EXPR(val1) +#define DCHECK_LT(val1, val2) ARROW_IGNORE_EXPR(val1) +#define DCHECK_GE(val1, val2) ARROW_IGNORE_EXPR(val1) +#define DCHECK_GT(val1, val2) ARROW_IGNORE_EXPR(val1) + +#else // !GANDIVA_IR + #include #include #include +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { @@ -155,6 +174,8 @@ class ARROW_EXPORT ArrowLog : public ArrowLogBase { static void InstallFailureSignalHandler(); private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog); + // Hide the implementation of log provider by void *. // Otherwise, lib user may define the same macro to use the correct header file. void* logging_provider_; @@ -182,5 +203,6 @@ class ARROW_EXPORT Voidify { } // namespace util } // namespace arrow +#endif // GANDIVA_IR #endif // ARROW_UTIL_LOGGING_H diff --git a/cpp/src/arrow/util/machine-benchmark.cc b/cpp/src/arrow/util/machine-benchmark.cc new file mode 100644 index 0000000000000..ad3f413e7f0fd --- /dev/null +++ b/cpp/src/arrow/util/machine-benchmark.cc @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Non-Arrow system benchmarks, provided for convenience. 
+ +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +namespace arrow { + +// Generate a vector of indices such as following the indices describes +// a path over the whole vector. The path is randomized to avoid triggering +// automatic prefetching in the CPU. +std::vector RandomPath(int32_t size) { + std::default_random_engine gen(42); + std::vector indices(size); + + for (int32_t i = 0; i < size; ++i) { + indices[i] = i; + } + std::shuffle(indices.begin(), indices.end(), gen); + std::vector path(size, -999999); + int32_t prev; + prev = indices[size - 1]; + for (int32_t i = 0; i < size; ++i) { + int32_t next = indices[i]; + path[prev] = next; + prev = next; + } + return path; +} + +// Cache / main memory latency, depending on the working set size +static void BM_memory_latency(benchmark::State& state) { + const auto niters = static_cast(state.range(0)); + const std::vector path = RandomPath(niters / 4); + + int32_t total = 0; + int32_t index = 0; + for (auto _ : state) { + total += index; + index = path[index]; + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(BM_memory_latency)->RangeMultiplier(2)->Range(2 << 10, 2 << 24); + +} // namespace arrow diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 1d188820837fc..ab258252695ab 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -18,6 +18,8 @@ #ifndef ARROW_UTIL_MACROS_H #define ARROW_UTIL_MACROS_H +#define ARROW_STRINGIFY(x) #x + // From Google gutil #ifndef ARROW_DISALLOW_COPY_AND_ASSIGN #define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \ @@ -111,6 +113,15 @@ #endif #endif // !defined(MANUALLY_ALIGNED_STRUCT) +// ---------------------------------------------------------------------- +// Convenience macro disabling a particular UBSan check in a function + +#if defined(__clang__) +#define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature))) +#else +#define ARROW_DISABLE_UBSAN(feature) +#endif + // ---------------------------------------------------------------------- // From googletest // (also in parquet-cpp) diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 23e0361235d3e..23e7061ac8738 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -34,7 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/date.h" +#include "arrow/vendored/date.h" namespace arrow { namespace internal { @@ -335,7 +335,10 @@ class StringToSignedIntConverterMixin { if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { return false; } - *out = static_cast(-static_cast(unsigned_value)); + // To avoid both compiler warnings (with unsigned negation) + // and undefined behaviour (with signed negation overflow), + // use the expanded formula for 2's complement negation. 
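// A sketch, not part of this patch, of the negation trick applied on the
// next line: for an unsigned magnitude u, ~u + 1 is its two's complement,
// computed entirely in unsigned arithmetic, so there is no signed-overflow
// UB and no "unary minus applied to unsigned" warning.

#include <cstdint>

inline int32_t NegateViaComplement(uint32_t magnitude) {
  return static_cast<int32_t>(~magnitude + 1);  // wraps, never overflows
}
// NegateViaComplement(5) == -5, and NegateViaComplement(2147483648u) yields
// INT32_MIN (two's complement), the one value plain signed negation cannot
// produce.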
+ *out = static_cast(~unsigned_value + 1); } else { if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { return false; diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index 88382618653e9..aac1b1523990c 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -193,7 +193,7 @@ void ValidateRle(const vector& values, int bit_width, uint8_t* expected_enc EXPECT_EQ(encoded_len, expected_len); } if (expected_encoding != NULL) { - EXPECT_EQ(memcmp(buffer, expected_encoding, expected_len), 0); + EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0); } // Verify read diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index a97543d5be799..acefc8e3f7583 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -436,6 +436,7 @@ bool RleDecoder::NextCounts() { literal_count_ = (indicator_value >> 1) * 8; } else { repeat_count_ = indicator_value >> 1; + // XXX (ARROW-4018) this is not big-endian compatible bool result = bit_reader_.GetAligned(static_cast(BitUtil::CeilDiv(bit_width_, 8)), reinterpret_cast(¤t_value_)); diff --git a/cpp/src/arrow/util/string_builder.h b/cpp/src/arrow/util/string_builder.h new file mode 100644 index 0000000000000..7b3e10742a9a9 --- /dev/null +++ b/cpp/src/arrow/util/string_builder.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. template + +#ifndef ARROW_UTIL_STRING_BUILDER_H +#define ARROW_UTIL_STRING_BUILDER_H + +#include +#include +#include + +namespace arrow { +namespace util { + +template +void StringBuilderRecursive(std::stringstream& stream, Head&& head) { + stream << head; +} + +template +void StringBuilderRecursive(std::stringstream& stream, Head&& head, Tail&&... tail) { + StringBuilderRecursive(stream, std::forward(head)); + StringBuilderRecursive(stream, std::forward(tail)...); +} + +template +std::string StringBuilder(Args&&... 
args) { + std::stringstream stream; + + StringBuilderRecursive(stream, std::forward(args)...); + + return stream.str(); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_STRING_BUILDER_H diff --git a/cpp/src/arrow/util/string_view.h b/cpp/src/arrow/util/string_view.h index 2ee594a9e9ad3..a1a813726e4f0 100644 --- a/cpp/src/arrow/util/string_view.h +++ b/cpp/src/arrow/util/string_view.h @@ -18,7 +18,7 @@ #ifndef ARROW_UTIL_STRING_VIEW_H #define ARROW_UTIL_STRING_VIEW_H -#include "arrow/util/string_view/string_view.hpp" +#include "arrow/vendored/string_view.hpp" // IWYU pragma: export namespace arrow { namespace util { diff --git a/cpp/src/arrow/util/task-group.cc b/cpp/src/arrow/util/task-group.cc index 3ea63fc5ad80e..52c40bd46d1d3 100644 --- a/cpp/src/arrow/util/task-group.cc +++ b/cpp/src/arrow/util/task-group.cc @@ -17,9 +17,11 @@ #include "arrow/util/task-group.h" +#include #include #include #include +#include #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" @@ -41,6 +43,8 @@ class SerialTaskGroup : public TaskGroup { Status current_status() override { return status_; } + bool ok() override { return status_.ok(); } + Status Finish() override { if (!finished_) { finished_ = true; @@ -70,7 +74,8 @@ class SerialTaskGroup : public TaskGroup { class ThreadedTaskGroup : public TaskGroup { public: - explicit ThreadedTaskGroup(ThreadPool* thread_pool) : thread_pool_(thread_pool) {} + explicit ThreadedTaskGroup(ThreadPool* thread_pool) + : thread_pool_(thread_pool), nremaining_(0), ok_(true) {} ~ThreadedTaskGroup() override { // Make sure all pending tasks are finished, so that dangling references @@ -79,22 +84,19 @@ class ThreadedTaskGroup : public TaskGroup { } void AppendReal(std::function task) override { - std::lock_guard lock(mutex_); - DCHECK(!finished_); - - if (status_.ok()) { - ++nremaining_; - status_ = thread_pool_->Spawn([&, task]() { - std::unique_lock lock(mutex_); - if (status_.ok()) { - lock.unlock(); + // The hot path is unlocked thanks to atomics + // Only if an error occurs is the lock taken + if (ok_.load(std::memory_order_acquire)) { + nremaining_.fetch_add(1, std::memory_order_acquire); + Status st = thread_pool_->Spawn([this, task]() { + if (ok_.load(std::memory_order_acquire)) { // XXX what about exceptions? 
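// A note on string_builder.h above (sketch, not part of this patch):
// StringBuilder streams every argument into a single std::stringstream,
// which is the machinery that lets the Status factories throughout this
// patch accept mixed argument lists. Hypothetical usage:
//
//   std::string msg =
//       arrow::util::StringBuilder("Failed to ", "open", " file: ", path,
//                                  " , error: ", std::strerror(errno_actual));
//
// Any type with an operator<< overload participates, so integers, sizes,
// and error-name strings concatenate without manual conversions.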
Status st = task(); - lock.lock(); - status_ &= st; + UpdateStatus(std::move(st)); } OneTaskDone(); }); + UpdateStatus(std::move(st)); } } @@ -103,15 +105,15 @@ class ThreadedTaskGroup : public TaskGroup { return status_; } + bool ok() override { return ok_.load(); } + Status Finish() override { std::unique_lock lock(mutex_); if (!finished_) { - cv_.wait(lock, [&]() { return nremaining_ == 0; }); + cv_.wait(lock, [&]() { return nremaining_.load() == 0; }); // Current tasks may start other tasks, so only set this when done finished_ = true; if (parent_) { - // Need to lock parent - std::lock_guard parent_lock(parent_->mutex_); parent_->OneTaskDone(); } } @@ -124,26 +126,42 @@ class ThreadedTaskGroup : public TaskGroup { std::lock_guard lock(mutex_); auto child = new ThreadedTaskGroup(thread_pool_); child->parent_ = this; - nremaining_++; + nremaining_.fetch_add(1, std::memory_order_acquire); return std::shared_ptr(child); } protected: + void UpdateStatus(Status&& st) { + // Must be called unlocked, only locks on error + if (ARROW_PREDICT_FALSE(!st.ok())) { + std::lock_guard lock(mutex_); + ok_.store(false, std::memory_order_release); + status_ &= std::move(st); + } + } + void OneTaskDone() { - // We are locked - --nremaining_; - DCHECK_GE(nremaining_, 0); - if (nremaining_ == 0) { + // Can be called unlocked thanks to atomics + auto nremaining = nremaining_.fetch_sub(1, std::memory_order_release) - 1; + DCHECK_GE(nremaining, 0); + if (nremaining == 0) { + // Take the lock so that ~ThreadedTaskGroup cannot destroy cv + // before cv.notify_one() has returned + std::unique_lock lock(mutex_); cv_.notify_one(); } } + // These members are usable unlocked ThreadPool* thread_pool_; + std::atomic nremaining_; + std::atomic ok_; + + // These members use locking std::mutex mutex_; std::condition_variable cv_; Status status_; bool finished_ = false; - int32_t nremaining_ = 0; ThreadedTaskGroup* parent_ = nullptr; }; diff --git a/cpp/src/arrow/util/task-group.h b/cpp/src/arrow/util/task-group.h index 450b6da5884fc..390d9476e59bd 100644 --- a/cpp/src/arrow/util/task-group.h +++ b/cpp/src/arrow/util/task-group.h @@ -59,7 +59,7 @@ class ARROW_EXPORT TaskGroup { virtual Status current_status() = 0; /// Whether some tasks have already failed. Non-blocking , useful for stopping early. - bool ok() { return current_status().ok(); } + virtual bool ok() = 0; /// How many tasks can typically be executed in parallel. /// This is only a hint, useful for testing or debugging. diff --git a/cpp/src/arrow/util/thread-pool-benchmark.cc b/cpp/src/arrow/util/thread-pool-benchmark.cc new file mode 100644 index 0000000000000..8d855d3acba09 --- /dev/null +++ b/cpp/src/arrow/util/thread-pool-benchmark.cc @@ -0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
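(Editorial note: the ThreadedTaskGroup change above is the heart of this patch: task accounting moves from a mutex-guarded counter to a pair of atomics, so appending and completing tasks stay lock-free unless an error occurs or the last task finishes. A minimal self-contained sketch of that pattern, with invented names and no Arrow dependencies, not code from the patch itself:)

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>

// Hypothetical illustration of the patch's locking discipline.
class CompletionCounter {
 public:
  void TaskStarted() { nremaining_.fetch_add(1, std::memory_order_acquire); }

  void TaskFailed() {
    // Errors are rare, so locking here keeps the hot path lock-free
    std::lock_guard<std::mutex> lock(mutex_);
    ok_.store(false, std::memory_order_release);
  }

  void TaskDone() {
    // fetch_sub returns the previous value, hence the - 1
    if (nremaining_.fetch_sub(1, std::memory_order_release) - 1 == 0) {
      // Lock so a waiter cannot destroy cv_ between its predicate
      // check and our notify_one() call
      std::unique_lock<std::mutex> lock(mutex_);
      cv_.notify_one();
    }
  }

  bool Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this]() { return nremaining_.load() == 0; });
    return ok_.load();  // false if any task failed
  }

 private:
  std::atomic<int32_t> nremaining_{0};
  std::atomic<bool> ok_{true};
  std::mutex mutex_;
  std::condition_variable cv_;
};

(The lock around notify_one() mirrors the comment in OneTaskDone() above: without it, the waiter could wake, see the count reach zero, and destroy the condition variable while notify_one() is still running.)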
diff --git a/cpp/src/arrow/util/thread-pool-benchmark.cc b/cpp/src/arrow/util/thread-pool-benchmark.cc
new file mode 100644
index 0000000000000..8d855d3acba09
--- /dev/null
+++ b/cpp/src/arrow/util/thread-pool-benchmark.cc
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/test-util.h"
+#include "arrow/util/task-group.h"
+#include "arrow/util/thread-pool.h"
+
+namespace arrow {
+namespace internal {
+
+struct Workload {
+  explicit Workload(int32_t size) : size_(size), data_(kDataSize) {
+    std::default_random_engine gen(42);
+    std::uniform_int_distribution<uint64_t> dist(0, std::numeric_limits<uint64_t>::max());
+    std::generate(data_.begin(), data_.end(), [&]() { return dist(gen); });
+  }
+
+  void operator()();
+
+ private:
+  static constexpr int32_t kDataSize = 32;
+
+  int32_t size_;
+  std::vector<uint64_t> data_;
+};
+
+void Workload::operator()() {
+  uint64_t result = 0;
+  for (int32_t i = 0; i < size_ / kDataSize; ++i) {
+    for (const auto v : data_) {
+      result = (result << (v % 64)) - v;
+    }
+  }
+  benchmark::DoNotOptimize(result);
+}
+
+struct Task {
+  explicit Task(int32_t size) : workload_(size) {}
+
+  Status operator()() {
+    workload_();
+    return Status::OK();
+  }
+
+ private:
+  Workload workload_;
+};
+
+// This benchmark simply provides a baseline indicating the raw cost of our workload
+// depending on the workload size. Number of items / second in this (serial)
+// benchmark can be compared to the numbers obtained in BM_ThreadPoolSpawn.
+static void BM_WorkloadCost(benchmark::State& state) {
+  const auto workload_size = static_cast<int32_t>(state.range(0));
+
+  Workload workload(workload_size);
+  for (auto _ : state) {
+    workload();
+  }
+
+  state.SetItemsProcessed(state.iterations());
+}
+
+// Benchmark ThreadPool::Spawn
+static void BM_ThreadPoolSpawn(benchmark::State& state) {
+  const auto nthreads = static_cast<int>(state.range(0));
+  const auto workload_size = static_cast<int32_t>(state.range(1));
+
+  Workload workload(workload_size);
+
+  // Spawn enough tasks to make the pool start up overhead negligible
+  const int32_t nspawns = 200000000 / workload_size + 1;
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    std::shared_ptr<ThreadPool> pool;
+    ABORT_NOT_OK(ThreadPool::Make(nthreads, &pool));
+    state.ResumeTiming();
+
+    for (int32_t i = 0; i < nspawns; ++i) {
+      // Pass the task by reference to avoid copying it around
+      ABORT_NOT_OK(pool->Spawn(std::ref(workload)));
+    }
+
+    // Wait for all tasks to finish
+    ABORT_NOT_OK(pool->Shutdown(true /* wait */));
+    state.PauseTiming();
+    pool.reset();
+    state.ResumeTiming();
+  }
+  state.SetItemsProcessed(state.iterations() * nspawns);
+}
+
+// Benchmark serial TaskGroup
+static void BM_SerialTaskGroup(benchmark::State& state) {
+  const auto workload_size = static_cast<int32_t>(state.range(0));
+
+  Task task(workload_size);
+
+  const int32_t nspawns = 10000000 / workload_size + 1;
+
+  for (auto _ : state) {
+    auto task_group = TaskGroup::MakeSerial();
+    for (int32_t i = 0; i < nspawns; ++i) {
+      // Pass the task by reference to avoid copying it around
+      task_group->Append(std::ref(task));
+    }
+    ABORT_NOT_OK(task_group->Finish());
+  }
+  state.SetItemsProcessed(state.iterations() * nspawns);
+}
+
+// Benchmark threaded TaskGroup
+static void BM_ThreadedTaskGroup(benchmark::State& state) {
+  const auto nthreads = static_cast<int>(state.range(0));
+  const auto workload_size = static_cast<int32_t>(state.range(1));
+
+  std::shared_ptr<ThreadPool> pool;
+  ABORT_NOT_OK(ThreadPool::Make(nthreads, &pool));
+
+  Task task(workload_size);
+
+  const int32_t nspawns = 10000000 / workload_size + 1;
+
+  for (auto _ : state) {
+    auto task_group = TaskGroup::MakeThreaded(pool.get());
+    for (int32_t i = 0; i < nspawns; ++i) {
+      // Pass the task by reference to avoid copying it around
+      task_group->Append(std::ref(task));
+    }
+    ABORT_NOT_OK(task_group->Finish());
+  }
+  ABORT_NOT_OK(pool->Shutdown(true /* wait */));
+
+  state.SetItemsProcessed(state.iterations() * nspawns);
+}
+
+static const int32_t kWorkloadSizes[] = {1000, 10000, 100000};
+
+static void WorkloadCost_Customize(benchmark::internal::Benchmark* b) {
+  for (const auto w : kWorkloadSizes) {
+    b->Args({w});
+  }
+  b->ArgNames({"task_cost"});
+}
+
+static void ThreadPoolSpawn_Customize(benchmark::internal::Benchmark* b) {
+  for (const int32_t w : kWorkloadSizes) {
+    for (const int nthreads : {1, 2, 4, 8}) {
+      b->Args({nthreads, w});
+    }
+  }
+  b->ArgNames({"threads", "task_cost"});
+}
+
+static const int kRepetitions = 1;
+
+BENCHMARK(BM_WorkloadCost)->Repetitions(kRepetitions)->Apply(WorkloadCost_Customize);
+
+BENCHMARK(BM_ThreadPoolSpawn)
+    ->UseRealTime()
+    ->Repetitions(kRepetitions)
+    ->Apply(ThreadPoolSpawn_Customize);
+
+BENCHMARK(BM_SerialTaskGroup)
+    ->UseRealTime()
+    ->Repetitions(kRepetitions)
+    ->Apply(WorkloadCost_Customize);
+
+BENCHMARK(BM_ThreadedTaskGroup)
+    ->UseRealTime()
+    ->Repetitions(kRepetitions)
+    ->Apply(ThreadPoolSpawn_Customize);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/thread-pool-test.cc b/cpp/src/arrow/util/thread-pool-test.cc
index 6d7b9e230f080..c0deb20ccdde1 100644
--- a/cpp/src/arrow/util/thread-pool-test.cc
+++ b/cpp/src/arrow/util/thread-pool-test.cc
@@ -298,7 +298,8 @@ TEST_F(TestThreadPool, Submit) {
 
 // Test fork safety on Unix
 
-#if !(defined(_WIN32) || defined(ARROW_VALGRIND))
+#if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \
+      defined(THREAD_SANITIZER))
 TEST_F(TestThreadPool, ForkSafety) {
   pid_t child_pid;
   int child_status;
diff --git a/cpp/src/arrow/util/thread-pool.cc b/cpp/src/arrow/util/thread-pool.cc
index 751b264b42f59..17ad9c4972fa2 100644
--- a/cpp/src/arrow/util/thread-pool.cc
+++ b/cpp/src/arrow/util/thread-pool.cc
@@ -34,6 +34,9 @@ namespace internal {
 struct ThreadPool::State {
   State() : desired_capacity_(0), please_shutdown_(false), quick_shutdown_(false) {}
 
+  // NOTE: in case locking becomes too expensive, we can investigate lock-free FIFOs
+  // such as https://github.com/cameron314/concurrentqueue
+
   std::mutex mutex_;
   std::condition_variable cv_;
   std::condition_variable cv_shutdown_;
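(Editorial note: the two TaskGroup benchmarks above also double as a compact usage reference for this API. A hedged example of driving it directly; RunBatch is an invented name, and the includes and the RETURN_NOT_OK macro are assumed to come from the same headers the benchmark uses:)

#include <memory>

#include "arrow/status.h"
#include "arrow/util/task-group.h"
#include "arrow/util/thread-pool.h"

using arrow::Status;
using arrow::internal::TaskGroup;
using arrow::internal::ThreadPool;

// Submit a batch of fallible tasks; Finish() blocks until all tasks are
// done and returns their combined status.
Status RunBatch(int ntasks) {
  std::shared_ptr<ThreadPool> pool;
  RETURN_NOT_OK(ThreadPool::Make(/*threads=*/4, &pool));

  auto group = TaskGroup::MakeThreaded(pool.get());
  for (int i = 0; i < ntasks; ++i) {
    group->Append([i]() {
      return i != 13 ? Status::OK() : Status::Invalid("unlucky task");
    });
  }
  RETURN_NOT_OK(group->Finish());
  return pool->Shutdown(/*wait=*/true);
}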
diff --git a/cpp/src/arrow/util/trie-benchmark.cc b/cpp/src/arrow/util/trie-benchmark.cc
new file mode 100644
index 0000000000000..acc2892689ff4
--- /dev/null
+++ b/cpp/src/arrow/util/trie-benchmark.cc
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/test-util.h"
+#include "arrow/util/trie.h"
+
+namespace arrow {
+namespace internal {
+
+static inline bool InlinedNullLookup(util::string_view s) {
+  // An inlined version of trie lookup for a specific set of strings
+  // (see AllNulls())
+  auto size = s.length();
+  auto data = s.data();
+  if (size == 0) {
+    return false;
+  }
+  if (size == 1) {
+    return false;
+  }
+
+  auto chars = reinterpret_cast<const char*>(data);
+  auto first = chars[0];
+  auto second = chars[1];
+  switch (first) {
+    case 'N': {
+      // "NA", "N/A", "NaN", "NULL"
+      if (size == 2) {
+        return second == 'A';
+      }
+      auto third = chars[2];
+      if (size == 3) {
+        return (second == '/' && third == 'A') || (second == 'a' && third == 'N');
+      }
+      if (size == 4) {
+        return (second == 'U' && third == 'L' && chars[3] == 'L');
+      }
+      return false;
+    }
+    case 'n': {
+      // "n/a", "nan", "null"
+      if (size == 2) {
+        return false;
+      }
+      auto third = chars[2];
+      if (size == 3) {
+        return (second == '/' && third == 'a') || (second == 'a' && third == 'n');
+      }
+      if (size == 4) {
+        return (second == 'u' && third == 'l' && chars[3] == 'l');
+      }
+      return false;
+    }
+    case '1': {
+      // '1.#IND', '1.#QNAN'
+      if (size == 6) {
+        // '#' is the most unlikely char here, check it first
+        return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'I' &&
+                chars[4] == 'N' && chars[5] == 'D');
+      }
+      if (size == 7) {
+        return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'Q' &&
+                chars[4] == 'N' && chars[5] == 'A' && chars[6] == 'N');
+      }
+      return false;
+    }
+    case '-': {
+      switch (second) {
+        case 'N':
+          // "-NaN"
+          return (size == 4 && chars[2] == 'a' && chars[3] == 'N');
+        case 'n':
+          // "-nan"
+          return (size == 4 && chars[2] == 'a' && chars[3] == 'n');
+        case '1':
+          // "-1.#IND", "-1.#QNAN"
+          if (size == 7) {
+            return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'I' &&
+                    chars[5] == 'N' && chars[6] == 'D');
+          }
+          if (size == 8) {
+            return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'Q' &&
+                    chars[5] == 'N' && chars[6] == 'A' && chars[7] == 'N');
+          }
+          return false;
+        default:
+          return false;
+      }
+    }
+    case '#': {
+      // "#N/A", "#N/A N/A", "#NA"
+      if (size < 3 || chars[1] != 'N') {
+        return false;
+      }
+      auto third = chars[2];
+      if (size == 3) {
+        return third == 'A';
+      }
+      if (size == 4) {
+        return third == '/' && chars[3] == 'A';
+      }
+      if (size == 8) {
+        return std::memcmp(data + 2, "/A N/A", 5) == 0;
+      }
+      return false;
+    }
+    default:
+      return false;
+  }
+}
+
+std::vector<std::string> AllNulls() {
+  return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND",
+          "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"};
+}
+
+Trie MakeNullsTrie() {
+  auto nulls = AllNulls();
+
+  TrieBuilder builder;
+  for (const auto& str : AllNulls()) {
+    ABORT_NOT_OK(builder.Append(str));
+  }
+  return builder.Finish();
+}
+
+std::vector<std::string> Expand(const std::vector<std::string>& base, size_t n) {
+  std::vector<std::string> result;
+  result.reserve(n);
+
+  while (true) {
+    for (const auto& v : base) {
+      result.push_back(v);
+      if (result.size() == n) {
+        return result;
+      }
+    }
+  }
+}
+
+static void BenchmarkTrieLookups(benchmark::State& state,  // NOLINT non-const reference
+                                 const std::vector<std::string>& strings) {
+  Trie trie = MakeNullsTrie();
+  int32_t total = 0;
+
+  auto lookups = Expand(strings, 100);
+
+  for (auto _ : state) {
+    for (const auto& s : lookups) {
+      total += trie.Find(s);
+    }
+  }
+  benchmark::DoNotOptimize(total);
+  state.SetItemsProcessed(state.iterations() * lookups.size());
+}
+
+static void BenchmarkInlinedTrieLookups(
+    benchmark::State& state,  // NOLINT non-const reference
+    const std::vector<std::string>& strings) {
+  int32_t total = 0;
+
+  auto lookups = Expand(strings, 100);
+
+  for (auto _ : state) {
+    for (const auto& s : lookups) {
+      total += InlinedNullLookup(s);
+    }
+  }
+  benchmark::DoNotOptimize(total);
+  state.SetItemsProcessed(state.iterations() * lookups.size());
+}
+
+static void BM_TrieLookupFound(benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"});
+}
+
+static void BM_TrieLookupNotFound(
+    benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkTrieLookups(state, {"None", "1.0", "", "abc"});
+}
+
+static void BM_InlinedTrieLookupFound(
+    benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkInlinedTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"});
+}
+
+static void BM_InlinedTrieLookupNotFound(
+    benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkInlinedTrieLookups(state, {"None", "1.0", "", "abc"});
+}
+
+static const int kRepetitions = 2;
+
+BENCHMARK(BM_TrieLookupFound)->Repetitions(kRepetitions);
+BENCHMARK(BM_TrieLookupNotFound)->Repetitions(kRepetitions);
+BENCHMARK(BM_InlinedTrieLookupFound)->Repetitions(kRepetitions);
+BENCHMARK(BM_InlinedTrieLookupNotFound)->Repetitions(kRepetitions);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/trie-test.cc b/cpp/src/arrow/util/trie-test.cc
new file mode 100644
index 0000000000000..33eefa9d9335f
--- /dev/null
+++ b/cpp/src/arrow/util/trie-test.cc
@@ -0,0 +1,283 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/test-util.h"
+#include "arrow/util/trie.h"
+
+namespace arrow {
+namespace internal {
+
+TEST(SmallString, Basics) {
+  using SS = SmallString<5>;
+  {
+    SS s;
+    ASSERT_EQ(s.length(), 0);
+    ASSERT_EQ(util::string_view(s), util::string_view(""));
+    ASSERT_EQ(s, "");
+    ASSERT_NE(s, "x");
+    ASSERT_EQ(sizeof(s), 6);
+  }
+  {
+    SS s("abc");
+    ASSERT_EQ(s.length(), 3);
+    ASSERT_EQ(util::string_view(s), util::string_view("abc"));
+    ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0);
+    ASSERT_EQ(s, "abc");
+    ASSERT_NE(s, "ab");
+  }
+}
+
+TEST(SmallString, Assign) {
+  using SS = SmallString<5>;
+  auto s = SS();
+
+  s = util::string_view("abc");
+  ASSERT_EQ(s.length(), 3);
+  ASSERT_EQ(util::string_view(s), util::string_view("abc"));
+  ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0);
+  ASSERT_EQ(s, "abc");
+  ASSERT_NE(s, "ab");
+
+  s = std::string("ghijk");
+  ASSERT_EQ(s.length(), 5);
+  ASSERT_EQ(util::string_view(s), util::string_view("ghijk"));
+  ASSERT_EQ(std::memcmp(s.data(), "ghijk", 5), 0);
+  ASSERT_EQ(s, "ghijk");
+  ASSERT_NE(s, "");
+
+  s = SS("xy");
+  ASSERT_EQ(s.length(), 2);
+  ASSERT_EQ(util::string_view(s), util::string_view("xy"));
+  ASSERT_EQ(std::memcmp(s.data(), "xy", 2), 0);
+  ASSERT_EQ(s, "xy");
+  ASSERT_NE(s, "xyz");
+}
+
+TEST(SmallString, Substr) {
+  using SS = SmallString<5>;
+  {
+    auto s = SS();
+    ASSERT_EQ(s.substr(0), "");
+    ASSERT_EQ(s.substr(0, 2), "");
+  }
+  {
+    auto s = SS("abcd");
+    ASSERT_EQ(s.substr(0), "abcd");
+    ASSERT_EQ(s.substr(1), "bcd");
+    ASSERT_EQ(s.substr(4), "");
+    ASSERT_EQ(s.substr(0, 0), "");
+    ASSERT_EQ(s.substr(0, 3), "abc");
+    ASSERT_EQ(s.substr(0, 4), "abcd");
+    ASSERT_EQ(s.substr(1, 0), "");
+    ASSERT_EQ(s.substr(1, 2), "bc");
+    ASSERT_EQ(s.substr(4, 0), "");
+    ASSERT_EQ(s.substr(4, 1), "");
+  }
+}
+
+static std::vector<std::string> AllNulls() {
+  return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND",
+          "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"};
+}
+
+static void TestTrieContents(const Trie& trie, const std::vector<std::string>& entries) {
+  std::unordered_map<std::string, int32_t> control;
+  auto n_entries = static_cast<int32_t>(entries.size());
+
+  // Build control container
+  for (int32_t i = 0; i < n_entries; ++i) {
+    auto p = control.insert({entries[i], i});
+    ASSERT_TRUE(p.second);
+  }
+
+  // Check all existing entries in trie
+  for (int32_t i = 0; i < n_entries; ++i) {
+    ASSERT_EQ(i, trie.Find(entries[i])) << "for string '" << entries[i] << "'";
+  }
+
+  auto CheckNotExists = [&control, &trie](const std::string& s) {
+    auto p = control.find(s);
+    if (p == control.end()) {
+      ASSERT_EQ(-1, trie.Find(s)) << "for string '" << s << "'";
+    }
+  };
+
+  // Check potentially non-existing strings
+  CheckNotExists("");
+  CheckNotExists("X");
+  CheckNotExists("abcdefxxxxxxxxxxxxxxx");
+
+  // Check potentially non-existing variations of existing entries
+  for (const auto& e : entries) {
+    CheckNotExists(e + "X");
+    if (e.size() > 0) {
+      CheckNotExists(e.substr(0, 1));
+      auto prefix = e.substr(0, e.size() - 1);
+      CheckNotExists(prefix);
+      CheckNotExists(prefix + "X");
+      auto split_at = e.size() / 2;
+      CheckNotExists(e.substr(0, split_at) + 'x' + e.substr(split_at + 1));
+    }
+  }
+}
+
+static void TestTrieContents(const std::vector<std::string>& entries) {
+  TrieBuilder builder;
+  for (const auto& s : entries) {
+    ASSERT_OK(builder.Append(s));
+  }
+  const Trie trie = builder.Finish();
+  ASSERT_OK(trie.Validate());
+
+  TestTrieContents(trie, entries);
+}
+
+TEST(Trie, Empty) {
+  TrieBuilder builder;
+  const Trie trie = builder.Finish();
+  ASSERT_OK(trie.Validate());
+
+  ASSERT_EQ(-1, trie.Find(""));
+  ASSERT_EQ(-1, trie.Find("x"));
+}
+
+TEST(Trie, EmptyString) {
+  TrieBuilder builder;
+  ASSERT_OK(builder.Append(""));
+  const Trie trie = builder.Finish();
+  ASSERT_OK(trie.Validate());
+
+  ASSERT_EQ(0, trie.Find(""));
+  ASSERT_EQ(-1, trie.Find("x"));
+}
+
+TEST(Trie, Basics1) {
+  TestTrieContents({"abc", "de", "f"});
+  TestTrieContents({"abc", "de", "f", ""});
+}
+
+TEST(Trie, Basics2) {
+  TestTrieContents({"a", "abc", "abcd", "abcdef"});
+  TestTrieContents({"", "a", "abc", "abcd", "abcdef"});
+}
+
+TEST(Trie, Basics3) {
+  TestTrieContents({"abcd", "ab", "a"});
+  TestTrieContents({"abcd", "ab", "a", ""});
+}
+
+TEST(Trie, LongStrings) {
+  TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "defghijklmnopqrst"});
+  TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "abcde"});
+}
+
+TEST(Trie, NullChars) {
+  const std::string empty;
+  const std::string nul(1, '\x00');
+  std::string a, b, c, d;
+  a = "x" + nul + "y";
+  b = "x" + nul + "z";
+  c = nul + "y";
+  d = nul;
+  ASSERT_EQ(a.length(), 3);
+  ASSERT_EQ(d.length(), 1);
+
+  TestTrieContents({a, b, c, d});
+  TestTrieContents({a, b, c});
+  TestTrieContents({a, b, c, d, ""});
+  TestTrieContents({a, b, c, ""});
+  TestTrieContents({d, c, b, a});
+  TestTrieContents({c, b, a});
+  TestTrieContents({d, c, b, a, ""});
+  TestTrieContents({c, b, a, ""});
+}
+
+TEST(Trie, NegativeChars) {
+  // Test with characters >= 0x80 (to check the absence of sign issues)
+  TestTrieContents({"\x7f\x80\x81\xff", "\x7f\x80\x81", "\x7f\xff\x81", "\xff\x80\x81"});
+}
+
+TEST(Trie, CSVNulls) { TestTrieContents(AllNulls()); }
+
+TEST(Trie, Duplicates) {
+  {
+    TrieBuilder builder;
+    ASSERT_OK(builder.Append("ab"));
+    ASSERT_OK(builder.Append("abc"));
+    ASSERT_RAISES(Invalid, builder.Append("abc"));
+    ASSERT_OK(builder.Append("abcd"));
+    ASSERT_RAISES(Invalid, builder.Append("ab"));
+    ASSERT_OK(builder.Append("abcde"));
+    const Trie trie = builder.Finish();
+
+    TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"});
+  }
+  {
+    // With allow_duplicates = true
+    TrieBuilder builder;
+    ASSERT_OK(builder.Append("ab", true));
+    ASSERT_OK(builder.Append("abc", true));
+    ASSERT_OK(builder.Append("abc", true));
+    ASSERT_OK(builder.Append("abcd", true));
+    ASSERT_OK(builder.Append("ab", true));
+    ASSERT_OK(builder.Append("abcde", true));
+    const Trie trie = builder.Finish();
+
+    TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"});
+  }
+}
+
+TEST(Trie, CapacityError) {
+  // A trie uses 16-bit indices into various internal structures and
+  // therefore has limited size available.
+  TrieBuilder builder;
+  uint8_t first, second, third;
+  bool had_capacity_error = false;
+  uint8_t s[] = "\x00\x00\x00\x00";
+
+  for (first = 1; first < 125; ++first) {
+    s[0] = first;
+    for (second = 1; second < 125; ++second) {
+      s[1] = second;
+      for (third = 1; third < 125; ++third) {
+        s[2] = third;
+        auto st = builder.Append(reinterpret_cast<const char*>(s));
+        if (st.IsCapacityError()) {
+          DCHECK_GE(first, 2);
+          had_capacity_error = true;
+          break;
+        } else {
+          ASSERT_OK(st);
+        }
+      }
+    }
+  }
+  ASSERT_TRUE(had_capacity_error) << "Should have produced CapacityError";
+}
+
+}  // namespace internal
+}  // namespace arrow
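(Editorial note: before the implementation, it may help to spell out the layout the CapacityError test above depends on: indices are 16-bit and each node packs into 16 bytes, which is also why sizeof(SmallString<5>) == 6 in the SmallString tests. A hypothetical mirror of that layout; the real Node, defined in trie.h further below, nests the length byte inside SmallString:)

#include <cstdint>

struct PackedNode {
  int16_t found_index;   // entry index, or -1
  int16_t child_lookup;  // base into the 256-wide child lookup table, or -1
  uint8_t length;        // SmallString's length byte
  char substring[11];    // 16 - 2 * sizeof(int16_t) - 1
};

static_assert(sizeof(PackedNode) == 16, "node should pack into 16 bytes");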
diff --git a/cpp/src/arrow/util/trie.cc b/cpp/src/arrow/util/trie.cc
new file mode 100644
index 0000000000000..eaa02b7c5352e
--- /dev/null
+++ b/cpp/src/arrow/util/trie.cc
@@ -0,0 +1,209 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/trie.h"
+
+#include <iostream>
+#include <utility>
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+Status Trie::Validate() const {
+  const auto n_nodes = static_cast<fast_index_type>(nodes_.size());
+  if (size_ > n_nodes) {
+    return Status::Invalid("Number of entries larger than number of nodes");
+  }
+  for (const auto& node : nodes_) {
+    if (node.found_index_ >= size_) {
+      return Status::Invalid("Found index >= size");
+    }
+    if (node.child_lookup_ != -1 &&
+        node.child_lookup_ * 256 >
+            static_cast<fast_index_type>(lookup_table_.size() - 256)) {
+      return Status::Invalid("Child lookup base doesn't point to 256 valid indices");
+    }
+  }
+  for (const auto index : lookup_table_) {
+    if (index >= n_nodes) {
+      return Status::Invalid("Child lookup index out of bounds");
+    }
+  }
+  return Status::OK();
+}
+
+void Trie::Dump(const Node* node, const std::string& indent) const {
+  std::cerr << "[\"" << node->substring_ << "\"]";
+  if (node->found_index_ >= 0) {
+    std::cerr << " *";
+  }
+  std::cerr << "\n";
+  if (node->child_lookup_ >= 0) {
+    auto child_indent = indent + "   ";
+    std::cerr << child_indent << "|\n";
+    for (fast_index_type i = 0; i < 256; ++i) {
+      auto child_index = lookup_table_[node->child_lookup_ * 256 + i];
+      if (child_index >= 0) {
+        const Node* child = &nodes_[child_index];
+        std::cerr << child_indent << "|-> '" << static_cast<char>(i) << "' (" << i
+                  << ") -> ";
+        Dump(child, child_indent);
+      }
+    }
+  }
+}
+
+void Trie::Dump() const { Dump(&nodes_[0], ""); }
+
+TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); }
+
+Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) {
+  if (parent->child_lookup_ == -1) {
+    RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_));
+  }
+  auto parent_lookup = parent->child_lookup_ * 256 + ch;
+
+  DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1);
+  if (trie_.nodes_.size() >= static_cast<size_t>(kMaxIndex)) {
+    return Status::CapacityError("Trie out of bounds");
+  }
+  trie_.nodes_.push_back(std::move(node));
+  trie_.lookup_table_[parent_lookup] = static_cast<index_type>(trie_.nodes_.size() - 1);
+  return Status::OK();
+}
+
+Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch,
+                                    util::string_view substring) {
+  const auto kMaxSubstringLength = Trie::kMaxSubstringLength;
+
+  while (substring.length() > kMaxSubstringLength) {
+    // Substring doesn't fit in node => create intermediate node
+    auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)};
+    RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node)));
+    // Recurse
+    parent = &trie_.nodes_.back();
+    ch = static_cast<uint8_t>(substring[kMaxSubstringLength]);
+    substring = substring.substr(kMaxSubstringLength + 1);
+  }
+
+  // Create final matching node
+  auto child_node = Trie::Node{trie_.size_, -1, substring};
+  RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node)));
+  ++trie_.size_;
+  return Status::OK();
+}
+
+Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch,
+                                    util::string_view substring) {
+  return CreateChildNode(parent, static_cast<uint8_t>(ch), substring);
+}
+
+Status TrieBuilder::ExtendLookupTable(index_type* out_index) {
+  auto cur_size = trie_.lookup_table_.size();
+  auto cur_index = cur_size / 256;
+  if (cur_index > static_cast<size_t>(kMaxIndex)) {
+    return Status::CapacityError("Trie out of bounds");
+  }
+  trie_.lookup_table_.resize(cur_size + 256, -1);
+  *out_index = static_cast<index_type>(cur_index);
+  return Status::OK();
+}
+
+Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) {
+  Trie::Node* node = &trie_.nodes_[node_index];
+
+  DCHECK_LT(split_at, node->substring_length());
+
+  // Before:
+  //   {node} -> [...]
+  // After:
+  //   {node} -> [c] -> {out_node} -> [...]
+  auto child_node = Trie::Node{node->found_index_, node->child_lookup_,
+                               node->substring_.substr(split_at + 1)};
+  auto ch = node->substring_[split_at];
+  node->child_lookup_ = -1;
+  node->found_index_ = -1;
+  node->substring_ = node->substring_.substr(0, split_at);
+  RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node)));
+
+  return Status::OK();
+}
+
+Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) {
+  // Find or create node for string
+  fast_index_type node_index = 0;
+  fast_index_type pos = 0;
+  fast_index_type remaining = static_cast<fast_index_type>(s.length());
+
+  while (true) {
+    Trie::Node* node = &trie_.nodes_[node_index];
+    const auto substring_length = node->substring_length();
+    const auto substring_data = node->substring_data();
+
+    for (fast_index_type i = 0; i < substring_length; ++i) {
+      if (remaining == 0) {
+        // New string too short => need to split node
+        RETURN_NOT_OK(SplitNode(node_index, i));
+        // Current node matches exactly
+        node = &trie_.nodes_[node_index];
+        node->found_index_ = trie_.size_++;
+        return Status::OK();
+      }
+      if (s[pos] != substring_data[i]) {
+        // Mismatching substring => need to split node
+        RETURN_NOT_OK(SplitNode(node_index, i));
+        // Create new node for mismatching char
+        node = &trie_.nodes_[node_index];
+        return CreateChildNode(node, s[pos], s.substr(pos + 1));
+      }
+      ++pos;
+      --remaining;
+    }
+    if (remaining == 0) {
+      // Node matches exactly
+      if (node->found_index_ >= 0) {
+        if (allow_duplicate) {
+          return Status::OK();
+        } else {
+          return Status::Invalid("Duplicate entry in trie");
+        }
+      }
+      node->found_index_ = trie_.size_++;
+      return Status::OK();
+    }
+    // Lookup child using next input character
+    if (node->child_lookup_ == -1) {
+      // Need to extend lookup table for this node
+      RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_));
+    }
+    auto c = static_cast<uint8_t>(s[pos++]);
+    --remaining;
+    node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c];
+    if (node_index == -1) {
+      // Child not found => need to create child node
+      return CreateChildNode(node, c, s.substr(pos));
+    }
+    node = &trie_.nodes_[node_index];
+  }
+}
+
+Trie TrieBuilder::Finish() { return std::move(trie_); }
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/trie.h b/cpp/src/arrow/util/trie.h
new file mode 100644
index 0000000000000..3e82bfd8ee28f
--- /dev/null
+++ b/cpp/src/arrow/util/trie.h
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_TRIE_H
+#define ARROW_UTIL_TRIE_H
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// A non-zero-terminated small string class.
+// std::string usually has a small string optimization
+// (see review at https://shaharmike.com/cpp/std-string/)
+// but this one allows tight control and optimization of memory layout.
+template <size_t N>
+class SmallString {
+ public:
+  SmallString() : length_(0) {}
+
+  template <typename T>
+  SmallString(const T& v) {  // NOLINT implicit constructor
+    *this = util::string_view(v);
+  }
+
+  SmallString& operator=(const util::string_view s) {
+#ifndef NDEBUG
+    CheckSize(s.size());
+#endif
+    length_ = static_cast<uint8_t>(s.size());
+    std::memcpy(data_, s.data(), length_);
+    return *this;
+  }
+
+  SmallString& operator=(const std::string& s) {
+    *this = util::string_view(s);
+    return *this;
+  }
+
+  SmallString& operator=(const char* s) {
+    *this = util::string_view(s);
+    return *this;
+  }
+
+  explicit operator util::string_view() const {
+    return util::string_view(data_, length_);
+  }
+
+  const char* data() const { return data_; }
+  size_t length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+  char operator[](size_t pos) const {
+#ifdef NDEBUG
+    assert(pos <= length_);
+#endif
+    return data_[pos];
+  }
+
+  SmallString substr(size_t pos) const {
+    return SmallString(util::string_view(*this).substr(pos));
+  }
+
+  SmallString substr(size_t pos, size_t count) const {
+    return SmallString(util::string_view(*this).substr(pos, count));
+  }
+
+  template <typename T>
+  bool operator==(T&& other) const {
+    return util::string_view(*this) == util::string_view(std::forward<T>(other));
+  }
+
+  template <typename T>
+  bool operator!=(T&& other) const {
+    return util::string_view(*this) != util::string_view(std::forward<T>(other));
+  }
+
+ protected:
+  uint8_t length_;
+  char data_[N];
+
+#ifndef NDEBUG
+  void CheckSize(size_t n) { assert(n <= N); }
+#endif
+};
+
+template <size_t N>
+std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
+  return os << util::string_view(str);
+}
+
+// A trie class for byte strings, optimized for small sets of short strings.
+// This class is immutable by design, use a TrieBuilder to construct it.
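(Editorial note: as the comment says, construction goes through TrieBuilder, and Find() then returns the index at which a string was appended, or -1. A small usage sketch based on the tests and benchmarks elsewhere in this patch, not itself part of the patch:)

#include <iostream>

#include "arrow/util/trie.h"

using arrow::internal::Trie;
using arrow::internal::TrieBuilder;

int main() {
  TrieBuilder builder;
  for (const char* s : {"NA", "NaN", "null"}) {
    // Append() can fail, e.g. with Invalid on duplicates or CapacityError
    if (!builder.Append(s).ok()) return 1;
  }
  Trie trie = builder.Finish();

  std::cout << trie.Find("NaN") << "\n";   // 1: second string appended
  std::cout << trie.Find("none") << "\n";  // -1: not in the set
  return 0;
}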
+class ARROW_EXPORT Trie {
+  using index_type = int16_t;
+  using fast_index_type = int_fast16_t;
+
+ public:
+  Trie() : size_(0) {}
+  Trie(Trie&&) = default;
+  Trie& operator=(Trie&&) = default;
+
+  int32_t Find(util::string_view s) const {
+    const Node* node = &nodes_[0];
+    fast_index_type pos = 0;
+    fast_index_type remaining = static_cast<fast_index_type>(s.length());
+
+    while (remaining > 0) {
+      auto substring_length = node->substring_length();
+      if (substring_length > 0) {
+        auto substring_data = node->substring_data();
+        if (remaining < substring_length) {
+          // Input too short
+          return -1;
+        }
+        for (fast_index_type i = 0; i < substring_length; ++i) {
+          if (s[pos++] != substring_data[i]) {
+            // Mismatching substring
+            return -1;
+          }
+          --remaining;
+        }
+        if (remaining == 0) {
+          // Matched node exactly
+          return node->found_index_;
+        }
+      }
+      // Lookup child using next input character
+      if (node->child_lookup_ == -1) {
+        // Input too long
+        return -1;
+      }
+      auto c = static_cast<uint8_t>(s[pos++]);
+      --remaining;
+      auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
+      if (child_index == -1) {
+        // Child not found
+        return -1;
+      }
+      node = &nodes_[child_index];
+    }
+
+    // Input exhausted
+    if (node->substring_.empty()) {
+      // Matched node exactly
+      return node->found_index_;
+    } else {
+      return -1;
+    }
+  }
+
+  Status Validate() const;
+
+  void Dump() const;
+
+ protected:
+  static constexpr size_t kNodeSize = 16;
+  static constexpr auto kMaxSubstringLength =
+      kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);
+
+  struct Node {
+    // If this node is a valid end of string, index of found string, otherwise -1
+    index_type found_index_;
+    // Base index for child lookup in lookup_table_ (-1 if no child nodes)
+    index_type child_lookup_;
+    // The substring for this node.
+    SmallString<kMaxSubstringLength> substring_;
+
+    fast_index_type substring_length() const {
+      return static_cast<fast_index_type>(substring_.length());
+    }
+    const char* substring_data() const { return substring_.data(); }
+  };
+
+  static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);
+
+  void Dump(const Node* node, const std::string& indent) const;
+
+  // Node table: entry 0 is the root node
+  std::vector<Node> nodes_;
+
+  // Indexed lookup structure: gives index in node table, or -1 if not found
+  std::vector<index_type> lookup_table_;
+
+  // Number of entries
+  index_type size_;
+
+  friend class TrieBuilder;
+};
+
+class ARROW_EXPORT TrieBuilder {
+  using index_type = Trie::index_type;
+  using fast_index_type = Trie::fast_index_type;
+
+ public:
+  TrieBuilder();
+  Status Append(util::string_view s, bool allow_duplicate = false);
+  Trie Finish();
+
+ protected:
+  // Extend the lookup table by 256 entries, return the index of the new span
+  Status ExtendLookupTable(index_type* out_lookup_index);
+  // Split the node given by the index at the substring index `split_at`
+  Status SplitNode(fast_index_type node_index, fast_index_type split_at);
+  // Append an already constructed child node to the parent
+  Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
+  // Create a matching child node from this parent
+  Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring);
+  Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring);
+
+  Trie trie_;
+
+  static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
+};
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_TRIE_H
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index f5a18be05a92f..072c2188f7081 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -24,6 +24,7 @@
 #include <string>
 
 #include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -157,6 +158,13 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
   return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
 }
 
+inline bool ValidateUTF8(const util::string_view& str) {
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(str.data());
+  const size_t length = str.size();
+
+  return ValidateUTF8(data, length);
+}
+
 }  // namespace util
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/variant.h b/cpp/src/arrow/util/variant.h
index 1aa9aa3732fdf..fecaa5107c660 100644
--- a/cpp/src/arrow/util/variant.h
+++ b/cpp/src/arrow/util/variant.h
@@ -1,1105 +1,35 @@
-// Copyright (c) MapBox
-// All rights reserved.
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
 //
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
+// http://www.apache.org/licenses/LICENSE-2.0
 //
-// - Redistributions of source code must retain the above copyright notice, this
-//   list of conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. #ifndef ARROW_UTIL_VARIANT_H #define ARROW_UTIL_VARIANT_H -#include -#include // size_t -#include // operator new -#include // runtime_error -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - - -#ifdef _MSC_VER -// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx -# ifdef NDEBUG -# define VARIANT_INLINE __forceinline -# else -# define VARIANT_INLINE //__declspec(noinline) -# endif -#else -# ifdef NDEBUG -# define VARIANT_INLINE //inline __attribute__((always_inline)) -# else -# define VARIANT_INLINE __attribute__((noinline)) -# endif -#endif -// clang-format on - -// Exceptions -#if defined( __EXCEPTIONS) || defined( _MSC_VER) -#define HAS_EXCEPTIONS -#endif - -#define VARIANT_MAJOR_VERSION 1 -#define VARIANT_MINOR_VERSION 1 -#define VARIANT_PATCH_VERSION 0 - -#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) +#include "arrow/vendored/variant/variant.hpp" // IWYU pragma: export +#include "arrow/vendored/variant/variant_io.hpp" namespace arrow { namespace util { -// XXX This should derive from std::logic_error instead of std::runtime_error. -// See https://github.com/mapbox/variant/issues/48 for details. -class bad_variant_access : public std::runtime_error -{ - -public: - explicit bad_variant_access(const std::string& what_arg) - : runtime_error(what_arg) {} - - explicit bad_variant_access(const char* what_arg) - : runtime_error(what_arg) {} - -}; // class bad_variant_access - -#if !defined(ARROW_VARIANT_MINIMIZE_SIZE) -using type_index_t = std::size_t; -#else -#if defined(ARROW_VARIANT_OPTIMIZE_FOR_SPEED) -using type_index_t = std::uint_fast8_t; -#else -using type_index_t = std::uint_least8_t; -#endif -#endif - -namespace detail { - -static constexpr type_index_t invalid_value = type_index_t(-1); - -template -struct direct_type; - -template -struct direct_type -{ - static constexpr type_index_t index = std::is_same::value - ? 
sizeof...(Types) - : direct_type::index; -}; - -template -struct direct_type -{ - static constexpr type_index_t index = invalid_value; -}; - -#if __cpp_lib_logical_traits >= 201510L - -using std::conjunction; -using std::disjunction; - -#else - -template -struct conjunction : std::true_type {}; - -template -struct conjunction : B1 {}; - -template -struct conjunction : std::conditional::type {}; - -template -struct conjunction : std::conditional, B1>::type {}; - -template -struct disjunction : std::false_type {}; - -template -struct disjunction : B1 {}; - -template -struct disjunction : std::conditional::type {}; - -template -struct disjunction : std::conditional>::type {}; - -#endif - -template -struct convertible_type; - -template -struct convertible_type -{ - static constexpr type_index_t index = std::is_convertible::value - ? disjunction...>::value ? invalid_value : sizeof...(Types) - : convertible_type::index; -}; - -template -struct convertible_type -{ - static constexpr type_index_t index = invalid_value; -}; - -template -struct value_traits -{ - using value_type = typename std::remove_const::type>::type; - using value_type_wrapper = recursive_wrapper; - static constexpr type_index_t direct_index = direct_type::index; - static constexpr bool is_direct = direct_index != invalid_value; - static constexpr type_index_t index_direct_or_wrapper = is_direct ? direct_index : direct_type::index; - static constexpr bool is_direct_or_wrapper = index_direct_or_wrapper != invalid_value; - static constexpr type_index_t index = is_direct_or_wrapper ? index_direct_or_wrapper : convertible_type::index; - static constexpr bool is_valid = index != invalid_value; - static constexpr type_index_t tindex = is_valid ? sizeof...(Types)-index : 0; - using target_type = typename std::tuple_element>::type; -}; - -template -struct enable_if_type -{ - using type = R; -}; - -template -struct result_of_unary_visit -{ - using type = typename std::result_of::type; -}; - -template -struct result_of_unary_visit::type> -{ - using type = typename F::result_type; -}; - -template -struct result_of_binary_visit -{ - using type = typename std::result_of::type; -}; - -template -struct result_of_binary_visit::type> -{ - using type = typename F::result_type; -}; - -template -struct static_max; - -template -struct static_max -{ - static const type_index_t value = arg; -}; - -template -struct static_max -{ - static const type_index_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; -}; - -template -struct variant_helper; - -template -struct variant_helper -{ - VARIANT_INLINE static void destroy(const type_index_t type_index, void* data) - { - if (type_index == sizeof...(Types)) - { - reinterpret_cast(data)->~T(); - } - else - { - variant_helper::destroy(type_index, data); - } - } - - VARIANT_INLINE static void move(const type_index_t old_type_index, void* old_value, void* new_value) - { - if (old_type_index == sizeof...(Types)) - { - new (new_value) T(std::move(*reinterpret_cast(old_value))); - } - else - { - variant_helper::move(old_type_index, old_value, new_value); - } - } - - VARIANT_INLINE static void copy(const type_index_t old_type_index, const void* old_value, void* new_value) - { - if (old_type_index == sizeof...(Types)) - { - new (new_value) T(*reinterpret_cast(old_value)); - } - else - { - variant_helper::copy(old_type_index, old_value, new_value); - } - } -}; - -template <> -struct variant_helper<> -{ - VARIANT_INLINE static void destroy(const type_index_t, void*) {} - VARIANT_INLINE static void move(const type_index_t, void*, void*) {} - VARIANT_INLINE static void copy(const type_index_t, const void*, void*) {} -}; - -template -struct unwrapper -{ - static T const& apply_const(T const& obj) { return obj; } - static T& apply(T& obj) { return obj; } -}; - -template -struct unwrapper> -{ - static auto apply_const(recursive_wrapper const& obj) - -> typename recursive_wrapper::type const& - { - return obj.get(); - } - static auto apply(recursive_wrapper& obj) - -> typename recursive_wrapper::type& - { - return obj.get(); - } -}; - -template -struct unwrapper> -{ - static auto apply_const(std::reference_wrapper const& obj) - -> typename std::reference_wrapper::type const& - { - return obj.get(); - } - static auto apply(std::reference_wrapper& obj) - -> typename std::reference_wrapper::type& - { - return obj.get(); - } -}; - -template -struct dispatcher; - -template -struct dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v, F&& f) - { - if (v.template is()) - { - return f(unwrapper::apply_const(v.template get_unchecked())); - } - else - { - return dispatcher::apply_const(v, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& v, F&& f) - { - if (v.template is()) - { - return f(unwrapper::apply(v.template get_unchecked())); - } - else - { - return dispatcher::apply(v, std::forward(f)); - } - } -}; - -template -struct dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v, F&& f) - { - return f(unwrapper::apply_const(v.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& v, F&& f) - { - return f(unwrapper::apply(v.template get_unchecked())); - } -}; - -template -struct binary_dispatcher_rhs; - -template -struct binary_dispatcher_rhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - if (rhs.template is()) // call binary functor - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - if (rhs.template is()) // call binary functor - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); - } - } -}; - -template -struct binary_dispatcher_rhs -{ - VARIANT_INLINE static R apply_const(V const& 
lhs, V const& rhs, F&& f) - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } -}; - -template -struct binary_dispatcher_lhs; - -template -struct binary_dispatcher_lhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - if (lhs.template is()) // call binary functor - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - if (lhs.template is()) // call binary functor - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); - } - } -}; - -template -struct binary_dispatcher_lhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } -}; - -template -struct binary_dispatcher; - -template -struct binary_dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) - { - if (v0.template is()) - { - if (v1.template is()) - { - return f(unwrapper::apply_const(v0.template get_unchecked()), - unwrapper::apply_const(v1.template get_unchecked())); // call binary functor - } - else - { - return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); - } - } - else if (v1.template is()) - { - return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); - } - return binary_dispatcher::apply_const(v0, v1, std::forward(f)); - } - - VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) - { - if (v0.template is()) - { - if (v1.template is()) - { - return f(unwrapper::apply(v0.template get_unchecked()), - unwrapper::apply(v1.template get_unchecked())); // call binary functor - } - else - { - return binary_dispatcher_rhs::apply(v0, v1, std::forward(f)); - } - } - else if (v1.template is()) - { - return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); - } - return binary_dispatcher::apply(v0, v1, std::forward(f)); - } -}; - -template -struct binary_dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) - { - return f(unwrapper::apply_const(v0.template get_unchecked()), - unwrapper::apply_const(v1.template get_unchecked())); // call binary functor - } - - VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) - { - return f(unwrapper::apply(v0.template get_unchecked()), - unwrapper::apply(v1.template get_unchecked())); // call binary functor - } -}; - -// comparator functors -struct equal_comp -{ - template - bool operator()(T const& lhs, T const& rhs) const - { - return lhs == rhs; - } -}; - -struct less_comp -{ - template - bool operator()(T const& lhs, T const& rhs) const - { - return lhs < rhs; - } -}; - -template -class comparer -{ -public: - explicit comparer(Variant const& lhs) noexcept - : lhs_(lhs) {} - comparer& operator=(comparer const&) = delete; - // visitor - template - bool 
operator()(T const& rhs_content) const - { - T const& lhs_content = lhs_.template get_unchecked(); - return Comp()(lhs_content, rhs_content); - } - -private: - Variant const& lhs_; -}; - -// hashing visitor -struct hasher -{ - template - std::size_t operator()(const T& hashable) const - { - return std::hash{}(hashable); - } -}; - -} // namespace detail - -struct no_init {}; - -template -class variant -{ - static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty."); - static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. Maybe use std::reference_wrapper?"); - static_assert(!detail::disjunction...>::value, "Variant can not hold array types."); - static_assert(sizeof...(Types) < std::numeric_limits::max(), "Internal index type must be able to accommodate all alternatives."); -private: - static const std::size_t data_size = detail::static_max::value; - static const std::size_t data_align = detail::static_max::value; -public: - struct adapted_variant_tag; - using types = std::tuple; -private: - using first_type = typename std::tuple_element<0, types>::type; - using data_type = typename std::aligned_storage::type; - using helper_type = detail::variant_helper; - - type_index_t type_index; - data_type data; - -public: - VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) - : type_index(sizeof...(Types)-1) - { - static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant."); - new (&data) first_type(); - } - - VARIANT_INLINE variant(no_init) noexcept - : type_index(detail::invalid_value) {} - - // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers - template , - typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > - VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) - : type_index(Traits::index) - { - new (&data) typename Traits::target_type(std::forward(val)); - } - - VARIANT_INLINE variant(variant const& old) - : type_index(old.type_index) - { - helper_type::copy(old.type_index, &old.data, &data); - } - - VARIANT_INLINE variant(variant&& old) - noexcept(detail::conjunction...>::value) - : type_index(old.type_index) - { - helper_type::move(old.type_index, &old.data, &data); - } - -private: - VARIANT_INLINE void copy_assign(variant const& rhs) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - helper_type::copy(rhs.type_index, &rhs.data, &data); - type_index = rhs.type_index; - } - - VARIANT_INLINE void move_assign(variant&& rhs) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - helper_type::move(rhs.type_index, &rhs.data, &data); - type_index = rhs.type_index; - } - -public: - VARIANT_INLINE variant& operator=(variant&& other) - { - move_assign(std::move(other)); - return *this; - } - - VARIANT_INLINE variant& operator=(variant const& other) - { - copy_assign(other); - return *this; - } - - // conversions - // move-assign - template - VARIANT_INLINE variant& operator=(T&& rhs) noexcept - { - variant temp(std::forward(rhs)); - move_assign(std::move(temp)); - return *this; - } - - // copy-assign - template - VARIANT_INLINE variant& operator=(T const& rhs) - { - variant temp(rhs); - copy_assign(temp); - return *this; - } - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE bool is() const - { - return type_index == 
detail::direct_type::index; - } - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE bool is() const - { - return type_index == detail::direct_type, Types...>::index; - } - - VARIANT_INLINE bool valid() const - { - return type_index != detail::invalid_value; - } - - template - VARIANT_INLINE void set(Args&&... args) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - new (&data) T(std::forward(args)...); - type_index = detail::direct_type::index; - } - - // get_unchecked() - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return *reinterpret_cast(&data); - } - -#ifdef HAS_EXCEPTIONS - // get() - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type::index) - { - return *reinterpret_cast(&data); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return *reinterpret_cast(&data); - } - -#ifdef HAS_EXCEPTIONS - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type::index) - { - return *reinterpret_cast(&data); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - // get_unchecked() - T stored as recursive_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return (*reinterpret_cast*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - // get() - T stored as recursive_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return (*reinterpret_cast const*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast const*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - // get_unchecked() - T stored as std::reference_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return (*reinterpret_cast*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - // get() - T stored as std::reference_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return (*reinterpret_cast const*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast const*>(&data)).get(); - } - else - { - 
throw bad_variant_access("in get()"); - } - } -#endif - - // This function is deprecated because it returns an internal index field. - // Use which() instead. - ARROW_DEPRECATED("Use which() instead") - VARIANT_INLINE type_index_t get_type_index() const - { - return type_index; - } - - VARIANT_INLINE int which() const noexcept - { - return static_cast(sizeof...(Types) - type_index - 1); - } - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE static constexpr int which() noexcept - { - return static_cast(sizeof...(Types)-detail::direct_type::index - 1); - } - - // visitor - // unary - template ::type> - auto VARIANT_INLINE static visit(V const& v, F&& f) - -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) - { - return detail::dispatcher::apply_const(v, std::forward(f)); - } - // non-const - template ::type> - auto VARIANT_INLINE static visit(V& v, F&& f) - -> decltype(detail::dispatcher::apply(v, std::forward(f))) - { - return detail::dispatcher::apply(v, std::forward(f)); - } - - // binary - // const - template ::type> - auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) - -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) - { - return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); - } - // non-const - template ::type> - auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) - -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) - { - return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); - } - - // match - // unary - template - auto VARIANT_INLINE match(Fs&&... fs) const - -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) - { - return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); - } - // non-const - template - auto VARIANT_INLINE match(Fs&&... 
fs) - -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) - { - return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); - } - - ~variant() noexcept // no-throw destructor - { - helper_type::destroy(type_index, &data); - } - - // comparison operators - // equality - VARIANT_INLINE bool operator==(variant const& rhs) const - { - assert(valid() && rhs.valid()); - if (this->which() != rhs.which()) - { - return false; - } - detail::comparer visitor(*this); - return visit(rhs, visitor); - } - - VARIANT_INLINE bool operator!=(variant const& rhs) const - { - return !(*this == rhs); - } - - // less than - VARIANT_INLINE bool operator<(variant const& rhs) const - { - assert(valid() && rhs.valid()); - if (this->which() != rhs.which()) - { - return this->which() < rhs.which(); - } - detail::comparer visitor(*this); - return visit(rhs, visitor); - } - VARIANT_INLINE bool operator>(variant const& rhs) const - { - return rhs < *this; - } - VARIANT_INLINE bool operator<=(variant const& rhs) const - { - return !(*this > rhs); - } - VARIANT_INLINE bool operator>=(variant const& rhs) const - { - return !(*this < rhs); - } -}; - -// unary visitor interface -// const -template -auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) -{ - return V::visit(v, std::forward(f)); -} - -// non-const -template -auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) -{ - return V::visit(v, std::forward(f)); -} - -// binary visitor interface -// const -template -auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) -{ - return V::binary_visit(v0, v1, std::forward(f)); -} - -// non-const -template -auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) -{ - return V::binary_visit(v0, v1, std::forward(f)); -} - -// getter interface - -#ifdef HAS_EXCEPTIONS -template -auto get(T& var)->decltype(var.template get()) -{ - return var.template get(); -} -#endif - -template -ResultType& get_unchecked(T& var) -{ - return var.template get_unchecked(); -} - -#ifdef HAS_EXCEPTIONS -template -auto get(T const& var)->decltype(var.template get()) -{ - return var.template get(); -} -#endif - -template -ResultType const& get_unchecked(T const& var) -{ - return var.template get_unchecked(); -} -// variant_size -template -struct variant_size; - -//variable templates is c++14 -//template -//constexpr std::size_t variant_size_v = variant_size::value; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size> - : std::integral_constant {}; - -// variant_alternative -template -struct variant_alternative; - -#if defined(__clang__) -#if __has_builtin(__type_pack_element) -#define has_type_pack_element -#endif -#endif - -#if defined(has_type_pack_element) -template -struct variant_alternative> -{ - static_assert(sizeof...(Types) > Index , "Index out of range"); - using type = __type_pack_element; -}; -#else -template -struct variant_alternative> - : variant_alternative> -{ - static_assert(sizeof...(Types) > Index -1 , "Index out of range"); -}; - -template -struct variant_alternative<0, variant> -{ - using type = First; -}; - -#endif - -template -using variant_alternative_t = typename variant_alternative::type; - -template -struct variant_alternative - : 
std::add_const> {}; - -template -struct variant_alternative - : std::add_volatile> {}; - -template -struct variant_alternative - : std::add_cv> {}; +using mapbox::util::apply_visitor; // seems akin to std::visit +using mapbox::util::bad_variant_access; +using mapbox::util::get; +using mapbox::util::variant; -} // namespace util -} // namespace arrow +} // namespace util +} // namespace arrow -#endif // ARROW_UTIL_VARIANT_H +#endif // ARROW_UTIL_VARIANT_H diff --git a/cpp/src/arrow/util/variant/optional.h b/cpp/src/arrow/util/variant/optional.h deleted file mode 100644 index 4c6671061fe80..0000000000000 --- a/cpp/src/arrow/util/variant/optional.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef ARROW_UTIL_VARIANT_OPTIONAL_H -#define ARROW_UTIL_VARIANT_OPTIONAL_H - -#pragma message("This implementation of optional is deprecated. See https://github.com/mapbox/variant/issues/64.") - -#include -#include - -#include - -namespace arrow { -namespace util { - -template -class optional -{ - static_assert(!std::is_reference::value, "optional doesn't support references"); - - struct none_type - { - }; - - variant variant_; - -public: - optional() = default; - - optional(optional const& rhs) - { - if (this != &rhs) - { // protect against invalid self-assignment - variant_ = rhs.variant_; - } - } - - optional(T const& v) { variant_ = v; } - - explicit operator bool() const noexcept { return variant_.template is(); } - - T const& get() const { return variant_.template get(); } - T& get() { return variant_.template get(); } - - T const& operator*() const { return this->get(); } - T operator*() { return this->get(); } - - optional& operator=(T const& v) - { - variant_ = v; - return *this; - } - - optional& operator=(optional const& rhs) - { - if (this != &rhs) - { - variant_ = rhs.variant_; - } - return *this; - } - - template - void emplace(Args&&... 
args) - { - variant_ = T{std::forward(args)...}; - } - - void reset() { variant_ = none_type{}; } - -}; // class optional - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_OPTIONAL_H diff --git a/cpp/src/arrow/util/variant/variant_cast.h b/cpp/src/arrow/util/variant/variant_cast.h deleted file mode 100644 index 71ae80b5dfab6..0000000000000 --- a/cpp/src/arrow/util/variant/variant_cast.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
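Note on the optional.h removal above: the shim was a thin wrapper over a two-alternative variant whose first alternative, none_type, marked the empty state, which is why a default-constructed optional was empty. A minimal sketch of the same idea against the newly vendored mapbox variant; the class name and include path are illustrative, not part of this change.

```cpp
#include <utility>
#include "arrow/vendored/variant/variant.hpp"

// Hypothetical re-creation of the removed shim, for illustration only.
template <typename T>
class simple_optional {
  struct none_type {};                     // first alternative == "empty"
  mapbox::util::variant<none_type, T> v_;  // default-constructs none_type

 public:
  simple_optional() = default;
  simple_optional(T const& value) : v_(value) {}

  explicit operator bool() const { return v_.template is<T>(); }
  T const& get() const { return v_.template get<T>(); }
  T& get() { return v_.template get<T>(); }

  template <typename... Args>
  void emplace(Args&&... args) {
    v_ = T{std::forward<Args>(args)...};
  }
  void reset() { v_ = none_type{}; }
};
```

Because the mapbox variant default-constructs its first alternative, putting none_type first gives the empty-by-default behavior for free.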
- -#ifndef ARROW_UTIL_VARIANT_CAST_H -#define ARROW_UTIL_VARIANT_CAST_H - -#include - -#include "arrow/util/macros.h" - -namespace arrow { -namespace util { - -namespace detail { - -template -class static_caster -{ -public: - template - T& operator()(V& v) const - { - return static_cast(v); - } -}; - -template -class dynamic_caster -{ -public: - using result_type = T&; - template - T& operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - throw std::bad_cast(); - } - template - T& operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return dynamic_cast(v); - } -}; - -template -class dynamic_caster -{ -public: - using result_type = T*; - template - T* operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return NULLPTR; - } - template - T* operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return dynamic_cast(&v); - } -}; -} - -template -typename detail::dynamic_caster::result_type -dynamic_variant_cast(V& v) -{ - return arrow::util::apply_visitor(detail::dynamic_caster(), v); -} - -template -typename detail::dynamic_caster::result_type -dynamic_variant_cast(const V& v) -{ - return arrow::util::apply_visitor(detail::dynamic_caster(), v); -} - -template -T& static_variant_cast(V& v) -{ - return arrow::util::apply_visitor(detail::static_caster(), v); -} - -template -const T& static_variant_cast(const V& v) -{ - return arrow::util::apply_visitor(detail::static_caster(), v); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_CAST_H diff --git a/cpp/src/arrow/util/variant/variant_io.h b/cpp/src/arrow/util/variant/variant_io.h deleted file mode 100644 index 5541a81f7035f..0000000000000 --- a/cpp/src/arrow/util/variant/variant_io.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
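Note on the variant_cast.h removal above: the helpers wrapped apply_visitor around a static_cast or dynamic_cast of the active alternative. A behavior sketch against the pre-change headers, with an invented Shape hierarchy; this would no longer compile after this change since the header is deleted.

```cpp
// Behavior sketch of the removed helpers; Shape/Circle/Square are made up.
struct Shape { virtual ~Shape() = default; };
struct Circle : Shape {};
struct Square : Shape {};

void VariantCastSketch() {
  arrow::util::variant<Circle, Square> v = Circle{};

  // static_variant_cast visited the active alternative and static_cast it:
  Shape& base = arrow::util::static_variant_cast<Shape>(v);

  // The pointer form of dynamic_variant_cast returned nullptr on a wrong
  // alternative; the reference form threw std::bad_cast instead.
  Circle* c = arrow::util::dynamic_variant_cast<Circle*>(v);  // non-null
  Square* s = arrow::util::dynamic_variant_cast<Square*>(v);  // nullptr
  (void)base; (void)c; (void)s;
}
```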
- -#ifndef ARROW_UTIL_VARIANT_IO_H -#define ARROW_UTIL_VARIANT_IO_H - -#include - -#include - -namespace arrow { -namespace util { - -namespace detail { -// operator<< helper -template -class printer -{ -public: - explicit printer(Out& out) - : out_(out) {} - printer& operator=(printer const&) = delete; - - // visitor - template - void operator()(T const& operand) const - { - out_ << operand; - } - -private: - Out& out_; -}; -} - -// operator<< -template -VARIANT_INLINE std::basic_ostream& -operator<<(std::basic_ostream& out, variant const& rhs) -{ - detail::printer> visitor(out); - apply_visitor(visitor, rhs); - return out; -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_IO_H diff --git a/cpp/src/arrow/util/variant/variant_visitor.h b/cpp/src/arrow/util/variant/variant_visitor.h deleted file mode 100644 index 66b1dfea3d7c9..0000000000000 --- a/cpp/src/arrow/util/variant/variant_visitor.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef ARROW_UTIL_VARIANT_VISITOR_HPP -#define ARROW_UTIL_VARIANT_VISITOR_HPP - -#include - -namespace arrow { -namespace util { - -template -struct visitor; - -template -struct visitor : Fn -{ - using Fn::operator(); - - template - visitor(T&& fn) : Fn(std::forward(fn)) {} -}; - -template -struct visitor : Fn, visitor -{ - using Fn::operator(); - using visitor::operator(); - - template - visitor(T&& fn, Ts&&... fns) - : Fn(std::forward(fn)) - , visitor(std::forward(fns)...) {} -}; - -template -visitor::type...> make_visitor(Fns&&... 
fns) -{ - return visitor::type...> - (std::forward(fns)...); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_VISITOR_HPP diff --git a/cpp/src/arrow/util/visibility.h b/cpp/src/arrow/util/visibility.h index 34aa752fd2153..b224717a62d19 100644 --- a/cpp/src/arrow/util/visibility.h +++ b/cpp/src/arrow/util/visibility.h @@ -43,15 +43,6 @@ #endif #endif // Non-Windows -// gcc and clang disagree about how to handle template visibility when you have -// explicit specializations https://llvm.org/bugs/show_bug.cgi?id=24815 - -#if defined(__clang__) -#define ARROW_EXTERN_TEMPLATE extern template class ARROW_EXPORT -#else -#define ARROW_EXTERN_TEMPLATE extern template class -#endif - // This is a complicated topic, some reading on it: // http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ #if defined(_MSC_VER) || defined(__clang__) diff --git a/python/testing/test_hdfs.sh b/cpp/src/arrow/vendored/CMakeLists.txt old mode 100755 new mode 100644 similarity index 80% rename from python/testing/test_hdfs.sh rename to cpp/src/arrow/vendored/CMakeLists.txt index 016e54a66a671..04ea67aa45d04 --- a/python/testing/test_hdfs.sh +++ b/cpp/src/arrow/vendored/CMakeLists.txt @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -set -ex +ARROW_INSTALL_ALL_HEADERS("arrow/vendored") -docker build -t arrow-hdfs-test -f hdfs/Dockerfile . -bash hdfs/restart_docker_container.sh -docker exec -it arrow-hdfs /io/hdfs/run_tests.sh -docker stop arrow-hdfs +add_subdirectory(variant) diff --git a/cpp/src/arrow/util/date.h b/cpp/src/arrow/vendored/date.h similarity index 100% rename from cpp/src/arrow/util/date.h rename to cpp/src/arrow/vendored/date.h diff --git a/cpp/src/arrow/util/string_view/string_view.hpp b/cpp/src/arrow/vendored/string_view.hpp similarity index 100% rename from cpp/src/arrow/util/string_view/string_view.hpp rename to cpp/src/arrow/vendored/string_view.hpp diff --git a/cpp/src/arrow/vendored/variant/CMakeLists.txt b/cpp/src/arrow/vendored/variant/CMakeLists.txt new file mode 100644 index 0000000000000..de26f938d72f3 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
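Aside on the visibility.h hunk above: ARROW_EXTERN_TEMPLATE existed because gcc and clang disagree about repeating the export attribute on extern explicit instantiations (the LLVM bug cited in the removed comment). A self-contained sketch of the pattern, with the macro and class names invented for illustration:

```cpp
// Sketch of the compiler split the removed macro papered over.
#if defined(_WIN32)
#define MY_EXPORT __declspec(dllexport)
#else
#define MY_EXPORT __attribute__((visibility("default")))
#endif

template <typename T>
class MY_EXPORT Holder {
 public:
  explicit Holder(T v) : value_(v) {}
  T value() const { return value_; }

 private:
  T value_;
};

// clang wanted the attribute repeated on the extern declaration; gcc did not.
#if defined(__clang__)
extern template class MY_EXPORT Holder<int>;
#else
extern template class Holder<int>;
#endif
```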
+ +ARROW_INSTALL_ALL_HEADERS("arrow/vendored/variant") diff --git a/cpp/src/arrow/util/variant/recursive_wrapper.h b/cpp/src/arrow/vendored/variant/recursive_wrapper.hpp similarity index 89% rename from cpp/src/arrow/util/variant/recursive_wrapper.h rename to cpp/src/arrow/vendored/variant/recursive_wrapper.hpp index c9d9385394b38..96b6a3f217f5b 100644 --- a/cpp/src/arrow/util/variant/recursive_wrapper.h +++ b/cpp/src/arrow/vendored/variant/recursive_wrapper.hpp @@ -1,7 +1,9 @@ -#ifndef ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H -#define ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 -// Based on variant/recursive_wrapper.h from boost. +#ifndef MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP +#define MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP + +// Based on variant/recursive_wrapper.hpp from boost. // // Original license: // @@ -15,7 +17,7 @@ #include #include -namespace arrow { +namespace mapbox { namespace util { template @@ -117,6 +119,6 @@ inline void swap(recursive_wrapper& lhs, recursive_wrapper& rhs) noexcept lhs.swap(rhs); } } // namespace util -} // namespace arrow +} // namespace mapbox -#endif // ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +#endif // MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP diff --git a/cpp/src/arrow/vendored/variant/variant.hpp b/cpp/src/arrow/vendored/variant/variant.hpp new file mode 100644 index 0000000000000..bb399dece1d57 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant.hpp @@ -0,0 +1,1029 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_HPP +#define MAPBOX_UTIL_VARIANT_HPP + +#include +#include // size_t +#include // operator new +#include // runtime_error +#include +#include +#include +#include +#include +#include + +#include "recursive_wrapper.hpp" +#include "variant_visitor.hpp" + +// clang-format off +// [[deprecated]] is only available in C++14, use this for the time being +#if __cplusplus <= 201103L +# ifdef __GNUC__ +# define MAPBOX_VARIANT_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define MAPBOX_VARIANT_DEPRECATED __declspec(deprecated) +# else +# define MAPBOX_VARIANT_DEPRECATED +# endif +#else +# define MAPBOX_VARIANT_DEPRECATED [[deprecated]] +#endif + + +#ifdef _MSC_VER +// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx +# ifdef NDEBUG +# define VARIANT_INLINE __forceinline +# else +# define VARIANT_INLINE //__declspec(noinline) +# endif +#else +# ifdef NDEBUG +# define VARIANT_INLINE //inline __attribute__((always_inline)) +# else +# define VARIANT_INLINE __attribute__((noinline)) +# endif +#endif +// clang-format on + +// Exceptions +#if defined( __EXCEPTIONS) || defined( _MSC_VER) +#define HAS_EXCEPTIONS +#endif + +#define VARIANT_MAJOR_VERSION 1 +#define VARIANT_MINOR_VERSION 1 +#define VARIANT_PATCH_VERSION 0 + +#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) + +namespace mapbox { +namespace util { + +// XXX This should derive from std::logic_error instead of std::runtime_error. +// See https://github.com/mapbox/variant/issues/48 for details. 
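As a usage sketch of the exception declared just below (include path taken from this diff; the "in get()" message comes from the get() implementations later in this header):

```cpp
#include <iostream>
#include <string>
#include "arrow/vendored/variant/variant.hpp"

int main() {
  mapbox::util::variant<int, std::string> v = 42;
  try {
    v.get<std::string>();  // wrong alternative: throws
  } catch (const mapbox::util::bad_variant_access& e) {
    // Currently a std::runtime_error subclass, per the XXX note above.
    std::cerr << e.what() << "\n";  // prints "in get()"
  }
  return 0;
}
```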
+class bad_variant_access : public std::runtime_error +{ + +public: + explicit bad_variant_access(const std::string& what_arg) + : runtime_error(what_arg) {} + + explicit bad_variant_access(const char* what_arg) + : runtime_error(what_arg) {} + +}; // class bad_variant_access + +template +struct MAPBOX_VARIANT_DEPRECATED static_visitor +{ + using result_type = R; + +protected: + static_visitor() {} + ~static_visitor() {} +}; + +namespace detail { + +static constexpr std::size_t invalid_value = std::size_t(-1); + +template +struct direct_type; + +template +struct direct_type +{ + static constexpr std::size_t index = std::is_same::value + ? sizeof...(Types) + : direct_type::index; +}; + +template +struct direct_type +{ + static constexpr std::size_t index = invalid_value; +}; + +#if __cpp_lib_logical_traits >= 201510L + +using std::conjunction; +using std::disjunction; + +#else + +template +struct conjunction : std::true_type {}; + +template +struct conjunction : B1 {}; + +template +struct conjunction : std::conditional::type {}; + +template +struct conjunction : std::conditional, B1>::type {}; + +template +struct disjunction : std::false_type {}; + +template +struct disjunction : B1 {}; + +template +struct disjunction : std::conditional::type {}; + +template +struct disjunction : std::conditional>::type {}; + +#endif + +template +struct convertible_type; + +template +struct convertible_type +{ + static constexpr std::size_t index = std::is_convertible::value + ? disjunction...>::value ? invalid_value : sizeof...(Types) + : convertible_type::index; +}; + +template +struct convertible_type +{ + static constexpr std::size_t index = invalid_value; +}; + +template +struct value_traits +{ + using value_type = typename std::remove_const::type>::type; + static constexpr std::size_t direct_index = direct_type::index; + static constexpr bool is_direct = direct_index != invalid_value; + static constexpr std::size_t index = is_direct ? direct_index : convertible_type::index; + static constexpr bool is_valid = index != invalid_value; + static constexpr std::size_t tindex = is_valid ? sizeof...(Types)-index : 0; + using target_type = typename std::tuple_element>::type; +}; + +template +struct enable_if_type +{ + using type = R; +}; + +template +struct result_of_unary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_unary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct result_of_binary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_binary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct static_max; + +template +struct static_max +{ + static const std::size_t value = arg; +}; + +template +struct static_max +{ + static const std::size_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; +}; + +template +struct variant_helper; + +template +struct variant_helper +{ + VARIANT_INLINE static void destroy(const std::size_t type_index, void* data) + { + if (type_index == sizeof...(Types)) + { + reinterpret_cast(data)->~T(); + } + else + { + variant_helper::destroy(type_index, data); + } + } + + VARIANT_INLINE static void move(const std::size_t old_type_index, void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(std::move(*reinterpret_cast(old_value))); + } + else + { + variant_helper::move(old_type_index, old_value, new_value); + } + } + + VARIANT_INLINE static void copy(const std::size_t old_type_index, const void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(*reinterpret_cast(old_value)); + } + else + { + variant_helper::copy(old_type_index, old_value, new_value); + } + } +}; + +template <> +struct variant_helper<> +{ + VARIANT_INLINE static void destroy(const std::size_t, void*) {} + VARIANT_INLINE static void move(const std::size_t, void*, void*) {} + VARIANT_INLINE static void copy(const std::size_t, const void*, void*) {} +}; + +template +struct unwrapper +{ + static T const& apply_const(T const& obj) { return obj; } + static T& apply(T& obj) { return obj; } +}; + +template +struct unwrapper> +{ + static auto apply_const(recursive_wrapper const& obj) + -> typename recursive_wrapper::type const& + { + return obj.get(); + } + static auto apply(recursive_wrapper& obj) + -> typename recursive_wrapper::type& + { + return obj.get(); + } +}; + +template +struct unwrapper> +{ + static auto apply_const(std::reference_wrapper const& obj) + -> typename std::reference_wrapper::type const& + { + return obj.get(); + } + static auto apply(std::reference_wrapper& obj) + -> typename std::reference_wrapper::type& + { + return obj.get(); + } +}; + +template +struct dispatcher; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + else + { + return dispatcher::apply_const(v, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply(v.template get_unchecked())); + } + else + { + return dispatcher::apply(v, std::forward(f)); + } + } +}; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + return f(unwrapper::apply(v.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_rhs; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V 
const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_lhs; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply(v0, v1, std::forward(f)); + } +}; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } +}; + +// comparator functors +struct equal_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs == rhs; + } +}; + +struct less_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs < rhs; + } +}; + +template +class comparer +{ +public: + explicit comparer(Variant const& lhs) noexcept + : lhs_(lhs) {} + comparer& operator=(comparer const&) = delete; + // visitor + template + bool operator()(T 
const& rhs_content) const + { + T const& lhs_content = lhs_.template get_unchecked(); + return Comp()(lhs_content, rhs_content); + } + +private: + Variant const& lhs_; +}; + +// hashing visitor +struct hasher +{ + template + std::size_t operator()(const T& hashable) const + { + return std::hash{}(hashable); + } +}; + +} // namespace detail + +struct no_init +{ +}; + +template +class variant +{ + static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty"); + static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. Maybe use std::reference_wrapper?"); + +private: + static const std::size_t data_size = detail::static_max::value; + static const std::size_t data_align = detail::static_max::value; +public: + struct adapted_variant_tag; + using types = std::tuple; +private: + using first_type = typename std::tuple_element<0, types>::type; + using data_type = typename std::aligned_storage::type; + using helper_type = detail::variant_helper; + + std::size_t type_index; + data_type data; + +public: + VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) + : type_index(sizeof...(Types)-1) + { + static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant"); + new (&data) first_type(); + } + + VARIANT_INLINE variant(no_init) noexcept + : type_index(detail::invalid_value) {} + + // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers + template , + typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > + VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) + : type_index(Traits::index) + { + new (&data) typename Traits::target_type(std::forward(val)); + } + + VARIANT_INLINE variant(variant const& old) + : type_index(old.type_index) + { + helper_type::copy(old.type_index, &old.data, &data); + } + + VARIANT_INLINE variant(variant&& old) + noexcept(detail::conjunction...>::value) + : type_index(old.type_index) + { + helper_type::move(old.type_index, &old.data, &data); + } + +private: + VARIANT_INLINE void copy_assign(variant const& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::copy(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + + VARIANT_INLINE void move_assign(variant&& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::move(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + +public: + VARIANT_INLINE variant& operator=(variant&& other) + { + move_assign(std::move(other)); + return *this; + } + + VARIANT_INLINE variant& operator=(variant const& other) + { + copy_assign(other); + return *this; + } + + // conversions + // move-assign + template + VARIANT_INLINE variant& operator=(T&& rhs) noexcept + { + variant temp(std::forward(rhs)); + move_assign(std::move(temp)); + return *this; + } + + // copy-assign + template + VARIANT_INLINE variant& operator=(T const& rhs) + { + variant temp(rhs); + copy_assign(temp); + return *this; + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type::index; + } + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type, Types...>::index; + } + + VARIANT_INLINE bool valid() const + { 
+ return type_index != detail::invalid_value; + } + + template + VARIANT_INLINE void set(Args&&... args) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + new (&data) T(std::forward(args)...); + type_index = detail::direct_type::index; + } + + // get_unchecked() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + // get() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // This function is deprecated because it returns an internal index field. + // Use which() instead. 
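The distinction matters because the internal field counts alternatives back to front, while which() reports the intuitive front-to-back position. A small sketch against the vendored header:

```cpp
#include <cassert>
#include <string>
#include "arrow/vendored/variant/variant.hpp"

int main() {
  mapbox::util::variant<bool, int, std::string> v = 7;  // holds the int alternative
  assert(v.is<int>());
  assert(v.which() == 1);  // 0 = bool, 1 = int, 2 = std::string
  // The deprecated get_type_index() also happens to return 1 here, but only
  // because int sits in the middle; for bool it would return 2. Prefer which().
  return 0;
}
```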
+ MAPBOX_VARIANT_DEPRECATED VARIANT_INLINE std::size_t get_type_index() const + { + return type_index; + } + + VARIANT_INLINE int which() const noexcept + { + return static_cast(sizeof...(Types)-type_index - 1); + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE static constexpr int which() noexcept + { + return static_cast(sizeof...(Types)-detail::direct_type::index - 1); + } + + // visitor + // unary + template ::type> + auto VARIANT_INLINE static visit(V const& v, F&& f) + -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) + { + return detail::dispatcher::apply_const(v, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static visit(V& v, F&& f) + -> decltype(detail::dispatcher::apply(v, std::forward(f))) + { + return detail::dispatcher::apply(v, std::forward(f)); + } + + // binary + // const + template ::type> + auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); + } + + // match + // unary + template + auto VARIANT_INLINE match(Fs&&... fs) const + -> decltype(variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...)); + } + // non-const + template + auto VARIANT_INLINE match(Fs&&... fs) + -> decltype(variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...)); + } + + ~variant() noexcept // no-throw destructor + { + helper_type::destroy(type_index, &data); + } + + // comparison operators + // equality + VARIANT_INLINE bool operator==(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return false; + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + + VARIANT_INLINE bool operator!=(variant const& rhs) const + { + return !(*this == rhs); + } + + // less than + VARIANT_INLINE bool operator<(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return this->which() < rhs.which(); + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + VARIANT_INLINE bool operator>(variant const& rhs) const + { + return rhs < *this; + } + VARIANT_INLINE bool operator<=(variant const& rhs) const + { + return !(*this > rhs); + } + VARIANT_INLINE bool operator>=(variant const& rhs) const + { + return !(*this < rhs); + } +}; + +// unary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// binary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) 
-> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// getter interface + +#ifdef HAS_EXCEPTIONS +template +auto get(T& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType& get_unchecked(T& var) +{ + return var.template get_unchecked(); +} + +#ifdef HAS_EXCEPTIONS +template +auto get(T const& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType const& get_unchecked(T const& var) +{ + return var.template get_unchecked(); +} +} // namespace util +} // namespace mapbox + +// hashable iff underlying types are hashable +namespace std { +template +struct hash< ::mapbox::util::variant> { + std::size_t operator()(const ::mapbox::util::variant& v) const noexcept + { + return ::mapbox::util::apply_visitor(::mapbox::util::detail::hasher{}, v); + } +}; +} + +#endif // MAPBOX_UTIL_VARIANT_HPP diff --git a/cpp/src/arrow/vendored/variant/variant_io.hpp b/cpp/src/arrow/vendored/variant/variant_io.hpp new file mode 100644 index 0000000000000..494d2a964e319 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant_io.hpp @@ -0,0 +1,47 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_IO_HPP +#define MAPBOX_UTIL_VARIANT_IO_HPP + +#include + +#include "variant.hpp" + +namespace mapbox { +namespace util { + +namespace detail { +// operator<< helper +template +class printer +{ +public: + explicit printer(Out& out) + : out_(out) {} + printer& operator=(printer const&) = delete; + + // visitor + template + void operator()(T const& operand) const + { + out_ << operand; + } + +private: + Out& out_; +}; +} + +// operator<< +template +VARIANT_INLINE std::basic_ostream& +operator<<(std::basic_ostream& out, variant const& rhs) +{ + detail::printer> visitor(out); + apply_visitor(visitor, rhs); + return out; +} +} // namespace util +} // namespace mapbox + +#endif // MAPBOX_UTIL_VARIANT_IO_HPP diff --git a/cpp/src/arrow/vendored/variant/variant_visitor.hpp b/cpp/src/arrow/vendored/variant/variant_visitor.hpp new file mode 100644 index 0000000000000..60020f4dd05dc --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant_visitor.hpp @@ -0,0 +1,40 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_VISITOR_HPP +#define MAPBOX_UTIL_VARIANT_VISITOR_HPP + +namespace mapbox { +namespace util { + +template +struct visitor; + +template +struct visitor : Fn +{ + using type = Fn; + using Fn::operator(); + + visitor(Fn fn) : Fn(fn) {} +}; + +template +struct visitor : Fn, visitor +{ + using type = visitor; + using Fn::operator(); + using visitor::operator(); + + visitor(Fn fn, Fns... fns) : Fn(fn), visitor(fns...) {} +}; + +template +visitor make_visitor(Fns... 
fns)
+{
+    return visitor(fns...);
+}
+
+} // namespace util
+} // namespace mapbox
+
+#endif // MAPBOX_UTIL_VARIANT_VISITOR_HPP
diff --git a/cpp/src/arrow/util/xxhash/xxhash.c b/cpp/src/arrow/vendored/xxhash/xxhash.c
similarity index 100%
rename from cpp/src/arrow/util/xxhash/xxhash.c
rename to cpp/src/arrow/vendored/xxhash/xxhash.c
diff --git a/cpp/src/arrow/util/xxhash/xxhash.h b/cpp/src/arrow/vendored/xxhash/xxhash.h
similarity index 100%
rename from cpp/src/arrow/util/xxhash/xxhash.h
rename to cpp/src/arrow/vendored/xxhash/xxhash.h
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index 47dba6cd8ddf2..0098e27590d10 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -30,33 +30,33 @@ namespace arrow {
     return Status::NotImplemented(array.type()->ToString()); \
   }

-ARRAY_VISITOR_DEFAULT(NullArray);
-ARRAY_VISITOR_DEFAULT(BooleanArray);
-ARRAY_VISITOR_DEFAULT(Int8Array);
-ARRAY_VISITOR_DEFAULT(Int16Array);
-ARRAY_VISITOR_DEFAULT(Int32Array);
-ARRAY_VISITOR_DEFAULT(Int64Array);
-ARRAY_VISITOR_DEFAULT(UInt8Array);
-ARRAY_VISITOR_DEFAULT(UInt16Array);
-ARRAY_VISITOR_DEFAULT(UInt32Array);
-ARRAY_VISITOR_DEFAULT(UInt64Array);
-ARRAY_VISITOR_DEFAULT(HalfFloatArray);
-ARRAY_VISITOR_DEFAULT(FloatArray);
-ARRAY_VISITOR_DEFAULT(DoubleArray);
-ARRAY_VISITOR_DEFAULT(BinaryArray);
-ARRAY_VISITOR_DEFAULT(StringArray);
-ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray);
-ARRAY_VISITOR_DEFAULT(Date32Array);
-ARRAY_VISITOR_DEFAULT(Date64Array);
-ARRAY_VISITOR_DEFAULT(Time32Array);
-ARRAY_VISITOR_DEFAULT(Time64Array);
-ARRAY_VISITOR_DEFAULT(TimestampArray);
-ARRAY_VISITOR_DEFAULT(IntervalArray);
-ARRAY_VISITOR_DEFAULT(ListArray);
-ARRAY_VISITOR_DEFAULT(StructArray);
-ARRAY_VISITOR_DEFAULT(UnionArray);
-ARRAY_VISITOR_DEFAULT(DictionaryArray);
-ARRAY_VISITOR_DEFAULT(Decimal128Array);
+ARRAY_VISITOR_DEFAULT(NullArray)
+ARRAY_VISITOR_DEFAULT(BooleanArray)
+ARRAY_VISITOR_DEFAULT(Int8Array)
+ARRAY_VISITOR_DEFAULT(Int16Array)
+ARRAY_VISITOR_DEFAULT(Int32Array)
+ARRAY_VISITOR_DEFAULT(Int64Array)
+ARRAY_VISITOR_DEFAULT(UInt8Array)
+ARRAY_VISITOR_DEFAULT(UInt16Array)
+ARRAY_VISITOR_DEFAULT(UInt32Array)
+ARRAY_VISITOR_DEFAULT(UInt64Array)
+ARRAY_VISITOR_DEFAULT(HalfFloatArray)
+ARRAY_VISITOR_DEFAULT(FloatArray)
+ARRAY_VISITOR_DEFAULT(DoubleArray)
+ARRAY_VISITOR_DEFAULT(BinaryArray)
+ARRAY_VISITOR_DEFAULT(StringArray)
+ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray)
+ARRAY_VISITOR_DEFAULT(Date32Array)
+ARRAY_VISITOR_DEFAULT(Date64Array)
+ARRAY_VISITOR_DEFAULT(Time32Array)
+ARRAY_VISITOR_DEFAULT(Time64Array)
+ARRAY_VISITOR_DEFAULT(TimestampArray)
+ARRAY_VISITOR_DEFAULT(IntervalArray)
+ARRAY_VISITOR_DEFAULT(ListArray)
+ARRAY_VISITOR_DEFAULT(StructArray)
+ARRAY_VISITOR_DEFAULT(UnionArray)
+ARRAY_VISITOR_DEFAULT(DictionaryArray)
+ARRAY_VISITOR_DEFAULT(Decimal128Array)

 #undef ARRAY_VISITOR_DEFAULT

@@ -68,33 +68,33 @@ ARRAY_VISITOR_DEFAULT(Decimal128Array);
     return Status::NotImplemented(type.ToString()); \
   }

-TYPE_VISITOR_DEFAULT(NullType);
-TYPE_VISITOR_DEFAULT(BooleanType);
-TYPE_VISITOR_DEFAULT(Int8Type);
-TYPE_VISITOR_DEFAULT(Int16Type);
-TYPE_VISITOR_DEFAULT(Int32Type);
-TYPE_VISITOR_DEFAULT(Int64Type);
-TYPE_VISITOR_DEFAULT(UInt8Type);
-TYPE_VISITOR_DEFAULT(UInt16Type);
-TYPE_VISITOR_DEFAULT(UInt32Type);
-TYPE_VISITOR_DEFAULT(UInt64Type);
-TYPE_VISITOR_DEFAULT(HalfFloatType);
-TYPE_VISITOR_DEFAULT(FloatType);
-TYPE_VISITOR_DEFAULT(DoubleType);
-TYPE_VISITOR_DEFAULT(StringType);
-TYPE_VISITOR_DEFAULT(BinaryType);
-TYPE_VISITOR_DEFAULT(FixedSizeBinaryType);
-TYPE_VISITOR_DEFAULT(Date64Type);
-TYPE_VISITOR_DEFAULT(Date32Type); -TYPE_VISITOR_DEFAULT(Time32Type); -TYPE_VISITOR_DEFAULT(Time64Type); -TYPE_VISITOR_DEFAULT(TimestampType); -TYPE_VISITOR_DEFAULT(IntervalType); -TYPE_VISITOR_DEFAULT(Decimal128Type); -TYPE_VISITOR_DEFAULT(ListType); -TYPE_VISITOR_DEFAULT(StructType); -TYPE_VISITOR_DEFAULT(UnionType); -TYPE_VISITOR_DEFAULT(DictionaryType); +TYPE_VISITOR_DEFAULT(NullType) +TYPE_VISITOR_DEFAULT(BooleanType) +TYPE_VISITOR_DEFAULT(Int8Type) +TYPE_VISITOR_DEFAULT(Int16Type) +TYPE_VISITOR_DEFAULT(Int32Type) +TYPE_VISITOR_DEFAULT(Int64Type) +TYPE_VISITOR_DEFAULT(UInt8Type) +TYPE_VISITOR_DEFAULT(UInt16Type) +TYPE_VISITOR_DEFAULT(UInt32Type) +TYPE_VISITOR_DEFAULT(UInt64Type) +TYPE_VISITOR_DEFAULT(HalfFloatType) +TYPE_VISITOR_DEFAULT(FloatType) +TYPE_VISITOR_DEFAULT(DoubleType) +TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) +TYPE_VISITOR_DEFAULT(Date64Type) +TYPE_VISITOR_DEFAULT(Date32Type) +TYPE_VISITOR_DEFAULT(Time32Type) +TYPE_VISITOR_DEFAULT(Time64Type) +TYPE_VISITOR_DEFAULT(TimestampType) +TYPE_VISITOR_DEFAULT(IntervalType) +TYPE_VISITOR_DEFAULT(Decimal128Type) +TYPE_VISITOR_DEFAULT(ListType) +TYPE_VISITOR_DEFAULT(StructType) +TYPE_VISITOR_DEFAULT(UnionType) +TYPE_VISITOR_DEFAULT(DictionaryType) #undef TYPE_VISITOR_DEFAULT diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index b6fc1f1ff2bfb..a5deaa7a1d22c 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -121,7 +121,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { // The scalar value's type depends on the array data type: // - the type's `c_type`, if any // - for boolean arrays, a `bool` -// - for binary, string and fixed-size binary arrars, a `util::string_view` +// - for binary, string and fixed-size binary arrays, a `util::string_view` template struct ArrayDataVisitor {}; diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index bd497dcb92882..e743b0e041cb8 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,31 +15,39 @@ # specific language governing permissions and limitations # under the License. -# LLVM/Clang is required by multiple subdirs. -cmake_minimum_required(VERSION 3.11) - -project(gandiva) - -find_package(LLVM) +set(GANDIVA_VERSION "${ARROW_VERSION}") # For "make gandiva" to build everything Gandiva-related +add_custom_target(gandiva-all) add_custom_target(gandiva) +add_custom_target(gandiva-tests) +add_custom_target(gandiva-benchmarks) + +add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) + +find_package(LLVM) # Set the path where the byte-code files will be installed. 
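These GANDIVA_BC_* settings, continued just below, move the installed byte-code file under the lib directory and feed configure_file() for bc_file_path.cc.in, shown further down. Under an assumed /usr/local install prefix, the generated translation unit would look roughly like:

```cpp
// Hypothetical result of configure_file() on bc_file_path.cc.in with an
// assumed /usr/local install prefix (the actual value depends on the build).
namespace gandiva {

// Path to the byte-code file.
extern const char kByteCodeFilePath[] = "/usr/local/lib/gandiva/irhelpers.bc";

}  // namespace gandiva
```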
set(GANDIVA_BC_INSTALL_DIR - ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gandiva) + ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/gandiva) set(GANDIVA_BC_FILE_NAME irhelpers.bc) set(GANDIVA_BC_INSTALL_PATH ${GANDIVA_BC_INSTALL_DIR}/${GANDIVA_BC_FILE_NAME}) -set(GANDIVA_BC_OUTPUT_PATH ${BUILD_OUTPUT_ROOT_DIRECTORY}/${GANDIVA_BC_FILE_NAME}) +set(GANDIVA_BC_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/${GANDIVA_BC_FILE_NAME}) +install(FILES + ${GANDIVA_BC_OUTPUT_PATH} + DESTINATION ${GANDIVA_BC_INSTALL_DIR}) set(BC_FILE_PATH_CC "${CMAKE_CURRENT_BINARY_DIR}/bc_file_path.cc") configure_file(bc_file_path.cc.in ${BC_FILE_PATH_CC}) +add_definitions(-DGANDIVA_BYTE_COMPILE_FILE_PATH="${GANDIVA_BC_OUTPUT_PATH}") set(SRC_FILES annotator.cc bitmap_accumulator.cc configuration.cc context_helper.cc + decimal_ir.cc + decimal_type_util.cc engine.cc date_utils.cc expr_decomposer.cc @@ -48,7 +56,14 @@ set(SRC_FILES annotator.cc expression_registry.cc exported_funcs_registry.cc filter.cc + function_ir_builder.cc function_registry.cc + function_registry_arithmetic.cc + function_registry_datetime.cc + function_registry_hash.cc + function_registry_math_ops.cc + function_registry_string.cc + function_registry_timestamp_arithmetic.cc function_signature.cc gdv_function_stubs.cc llvm_generator.cc @@ -59,14 +74,10 @@ set(SRC_FILES annotator.cc selection_vector.cc tree_expr_builder.cc to_date_holder.cc - ${SHARED_HELPER_FILES} ${BC_FILE_PATH_CC}) set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared - ${BOOST_REGEX_LIBRARY} - ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} LLVM::LLVM_INTERFACE ${RE2_LIBRARY}) @@ -98,35 +109,11 @@ add_dependencies(gandiva ${GANDIVA_LIBRARIES}) # install for gandiva include(GNUInstallDirs) -# install libgandiva -install( - TARGETS gandiva_shared gandiva_static - DESTINATION ${CMAKE_INSTALL_LIBDIR} -) - # install the header files. -install(FILES - arrow.h - condition.h - configuration.h - expression.h - expression_registry.h - filter.h - function_signature.h - gandiva_aliases.h - logging.h - projector.h - selection_vector.h - tree_expr_builder.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/gandiva") +ARROW_INSTALL_ALL_HEADERS("gandiva") # pkg-config support -configure_file(gandiva.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("gandiva") set(GANDIVA_STATIC_TEST_LINK_LIBS gandiva_static @@ -143,57 +130,59 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(multi_value_args) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (NO_TESTS) + return() + endif() + set(TEST_ARGUMENTS ENABLED PREFIX "gandiva" - LABELS "unittest;gandiva" + LABELS "gandiva-tests" ${ARG_UNPARSED_ARGUMENTS}) # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} ${TEST_ARGUMENTS} STATIC_LINK_LIBS ${GANDIVA_STATIC_TEST_LINK_LIBS}) else() - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} ${TEST_ARGUMENTS} STATIC_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) endif() - if(${REL_TEST_NAME} MATCHES "llvm" OR - ${REL_TEST_NAME} MATCHES "expression_registry") + set(TARGET_NAME gandiva-${REL_TEST_NAME}) + + if((TARGET ${TARGET_NAME}) AND + (${REL_TEST_NAME} MATCHES "llvm" OR + ${REL_TEST_NAME} MATCHES "expression_registry")) # If the unit test has llvm in its name, include llvm. 
- add_dependencies(gandiva-${REL_TEST_NAME} LLVM::LLVM_INTERFACE) - target_link_libraries(gandiva-${REL_TEST_NAME} PRIVATE LLVM::LLVM_INTERFACE) + add_dependencies(${TARGET_NAME} LLVM::LLVM_INTERFACE) + target_link_libraries(${TARGET_NAME} PRIVATE LLVM::LLVM_INTERFACE) endif() endfunction() -if (ARROW_GANDIVA_BUILD_TESTS) - ADD_GANDIVA_TEST(bitmap_accumulator_test) - ADD_GANDIVA_TEST(engine_llvm_test) - ADD_GANDIVA_TEST(function_signature_test) - ADD_GANDIVA_TEST(function_registry_test) - ADD_GANDIVA_TEST(llvm_types_test) - ADD_GANDIVA_TEST(llvm_generator_test) - ADD_GANDIVA_TEST(annotator_test) - ADD_GANDIVA_TEST(tree_expr_test) - ADD_GANDIVA_TEST(expr_decomposer_test) - ADD_GANDIVA_TEST(expression_registry_test) - ADD_GANDIVA_TEST(selection_vector_test) - ADD_GANDIVA_TEST(lru_cache_test) - ADD_GANDIVA_TEST(to_date_holder_test) - ADD_GANDIVA_TEST(simple_arena_test) -endif() +ADD_GANDIVA_TEST(bitmap_accumulator_test) +ADD_GANDIVA_TEST(engine_llvm_test) +ADD_GANDIVA_TEST(function_signature_test) +ADD_GANDIVA_TEST(function_registry_test) +ADD_GANDIVA_TEST(llvm_types_test) +ADD_GANDIVA_TEST(llvm_generator_test) +ADD_GANDIVA_TEST(annotator_test) +ADD_GANDIVA_TEST(tree_expr_test) +ADD_GANDIVA_TEST(expr_decomposer_test) +ADD_GANDIVA_TEST(expression_registry_test) +ADD_GANDIVA_TEST(selection_vector_test) +ADD_GANDIVA_TEST(lru_cache_test) +ADD_GANDIVA_TEST(to_date_holder_test) +ADD_GANDIVA_TEST(simple_arena_test) +ADD_GANDIVA_TEST(like_holder_test) +ADD_GANDIVA_TEST(decimal_type_util_test) if (ARROW_GANDIVA_JAVA) add_subdirectory(jni) endif() -add_subdirectory(precompiled) - -if (ARROW_GANDIVA_BUILD_TESTS) - include(CTest) - enable_testing() - add_subdirectory(tests) -endif() +add_subdirectory(precompiled) +add_subdirectory(tests) diff --git a/cpp/src/gandiva/arrow.h b/cpp/src/gandiva/arrow.h index ea283523a56dc..cc2bd9a10294b 100644 --- a/cpp/src/gandiva/arrow.h +++ b/cpp/src/gandiva/arrow.h @@ -35,6 +35,9 @@ using ArrayPtr = std::shared_ptr; using DataTypePtr = std::shared_ptr; using DataTypeVector = std::vector; +using Decimal128TypePtr = std::shared_ptr; +using Decimal128TypeVector = std::vector; + using FieldPtr = std::shared_ptr; using FieldVector = std::vector; @@ -48,6 +51,14 @@ using ArrayDataVector = std::vector; using Status = arrow::Status; using StatusCode = arrow::StatusCode; +static inline bool is_decimal_128(DataTypePtr type) { + if (type->id() == arrow::Type::DECIMAL) { + auto decimal_type = arrow::internal::checked_cast(type.get()); + return decimal_type->byte_width() == 16; + } else { + return false; + } +} } // namespace gandiva #endif // GANDIVA_EXPR_ARROW_H diff --git a/cpp/src/gandiva/basic_decimal_scalar.h b/cpp/src/gandiva/basic_decimal_scalar.h new file mode 100644 index 0000000000000..fab82277cc978 --- /dev/null +++ b/cpp/src/gandiva/basic_decimal_scalar.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
diff --git a/cpp/src/gandiva/basic_decimal_scalar.h b/cpp/src/gandiva/basic_decimal_scalar.h
new file mode 100644
index 0000000000000..fab82277cc978
--- /dev/null
+++ b/cpp/src/gandiva/basic_decimal_scalar.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/basic_decimal.h"
+
+namespace gandiva {
+
+using arrow::BasicDecimal128;
+
+/// Represents a 128-bit decimal value along with its precision and scale.
+class BasicDecimalScalar128 {
+ public:
+ BasicDecimalScalar128(int64_t high_bits, uint64_t low_bits, int32_t precision,
+ int32_t scale)
+ : value_(high_bits, low_bits), precision_(precision), scale_(scale) {}
+
+ BasicDecimalScalar128(const BasicDecimal128& value, int32_t precision, int32_t scale)
+ : value_(value), precision_(precision), scale_(scale) {}
+
+ BasicDecimalScalar128(int32_t precision, int32_t scale)
+ : precision_(precision), scale_(scale) {}
+
+ int32_t scale() const { return scale_; }
+
+ int32_t precision() const { return precision_; }
+
+ const BasicDecimal128& value() const { return value_; }
+
+ private:
+ BasicDecimal128 value_;
+ int32_t precision_;
+ int32_t scale_;
+};
+
+inline bool operator==(const BasicDecimalScalar128& left,
+ const BasicDecimalScalar128& right) {
+ return left.value() == right.value() && left.precision() == right.precision() &&
+ left.scale() == right.scale();
+}
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/bc_file_path.cc.in b/cpp/src/gandiva/bc_file_path.cc.in
index d6b4e342b6714..54e81ca2bfa18 100644
--- a/cpp/src/gandiva/bc_file_path.cc.in
+++ b/cpp/src/gandiva/bc_file_path.cc.in
@@ -18,6 +18,6 @@
 namespace gandiva {

 // Path to the byte-code file.
-extern const char kByteCodeFilePath[] = "${GANDIVA_BC_OUTPUT_PATH}";
+extern const char kByteCodeFilePath[] = "${GANDIVA_BC_INSTALL_PATH}";

 } // namespace gandiva
diff --git a/cpp/src/gandiva/bitmap_accumulator_test.cc b/cpp/src/gandiva/bitmap_accumulator_test.cc
index fc89421344e83..53e8aaca21ff1 100644
--- a/cpp/src/gandiva/bitmap_accumulator_test.cc
+++ b/cpp/src/gandiva/bitmap_accumulator_test.cc
@@ -32,9 +32,8 @@ class TestBitMapAccumulator : public ::testing::Test {
 int nrecords);
 };

-void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, int nrecords) {
- int nbytes = nrecords / 8;
- unsigned int cur;
+void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, int nbytes) {
+ unsigned int cur = 0;

 for (int i = 0; i < nbytes; ++i) {
 rand_r(&cur);
@@ -62,7 +61,7 @@ TEST_F(TestBitMapAccumulator, TestIntersectBitMaps) {
 uint8_t expected_bitmap[length];

 for (int i = 0; i < 4; i++) {
- FillBitMap(src_bitmaps[i], nrecords);
+ FillBitMap(src_bitmaps[i], length);
 }

 for (int i = 0; i < 4; i++) {
diff --git a/cpp/src/gandiva/date_utils.cc b/cpp/src/gandiva/date_utils.cc
index 2686b193500ff..8a7e1f03fbd20 100644
--- a/cpp/src/gandiva/date_utils.cc
+++ b/cpp/src/gandiva/date_utils.cc
@@ -75,11 +75,8 @@ Status DateUtils::ToInternalFormat(const std::string& format,
 buffer.str("");
 continue;
 } else {
- if (buffer.str().length() > 0) {
- std::stringstream err_msg;
- err_msg << "Invalid date format string '" << format << "' at position " << i;
- return Status::Invalid(err_msg.str());
- }
+ ARROW_RETURN_IF(buffer.str().length() > 0,
+ Status::Invalid("Invalid date format string '", format, "'"));

 is_in_quoted_text = true;
 continue;
@@ -156,10 +153,7 @@ Status DateUtils::ToInternalFormat(const std::string& format,
 }
 }
 } else {
- // no potential matches found
- std::stringstream err_msg;
- err_msg << "Invalid date format string '" << format << "' at position " << i;
- return Status::Invalid(err_msg.str());
+ return Status::Invalid("Invalid date format string '", format, "'");
 }
 }
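These date_utils changes illustrate the error-handling rewrite applied mechanically throughout this patch: a stringstream plus early-return block collapses into a single ARROW_RETURN_IF with a variadic Status message. Schematically (the condition and message are placeholders):

// Before the patch:
if (bad_condition) {
  std::stringstream err_msg;
  err_msg << "Invalid date format string '" << format << "'";
  return Status::Invalid(err_msg.str());
}

// After the patch: one statement, no stringstream.
ARROW_RETURN_IF(bad_condition,
                Status::Invalid("Invalid date format string '", format, "'"));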
@@ -170,11 +164,10 @@ Status DateUtils::ToInternalFormat(const std::string& format,
 if (exactMatches.size() == 1 && exactMatches[0].length() == buffer.str().length()) {
 builder << sql_date_format_to_boost_map_[exactMatches[0]];
 } else {
- // we didn't successfully parse the entire string
+ // Format partially parsed
 int64_t pos = format.length() - buffer.str().length();
- std::stringstream err_msg;
- err_msg << "Invalid date format string '" << format << "' at position " << pos;
- return Status::Invalid(err_msg.str());
+ return Status::Invalid("Invalid date format string '", format, "' at position ",
+ pos);
 }
 }
 std::string final_pattern = builder.str();
diff --git a/cpp/src/gandiva/decimal_ir.cc b/cpp/src/gandiva/decimal_ir.cc
new file mode 100644
index 0000000000000..d10158a6f0487
--- /dev/null
+++ b/cpp/src/gandiva/decimal_ir.cc
@@ -0,0 +1,401 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "gandiva/decimal_ir.h"
+#include "gandiva/decimal_type_util.h"
+
+// Algorithms adapted from Apache Impala
+
+namespace gandiva {
+
+#define ADD_TRACE_32(msg, value) \
+ if (enable_ir_traces_) { \
+ AddTrace32(msg, value); \
+ }
+#define ADD_TRACE_128(msg, value) \
+ if (enable_ir_traces_) { \
+ AddTrace128(msg, value); \
+ }
+
+const char* DecimalIR::kScaleMultipliersName = "gandivaScaleMultipliers";
+
+/// Populate globals required by decimal IR.
+/// TODO: can this be done just once?
+void DecimalIR::AddGlobals(Engine* engine) {
+ auto types = engine->types();
+
+ // populate vector : [ 1, 10, 100, 1000, ..]
+ std::string value = "1";
+ std::vector<llvm::Constant*> scale_multipliers;
+ for (int i = 0; i < DecimalTypeUtil::kMaxPrecision + 1; ++i) {
+ auto multiplier =
+ llvm::ConstantInt::get(llvm::Type::getInt128Ty(*engine->context()), value, 10);
+ scale_multipliers.push_back(multiplier);
+ value.append("0");
+ }
+
+ auto array_type =
+ llvm::ArrayType::get(types->i128_type(), DecimalTypeUtil::kMaxPrecision + 1);
+ auto initializer = llvm::ConstantArray::get(
+ array_type, llvm::ArrayRef<llvm::Constant*>(scale_multipliers));
+
+ auto globalScaleMultipliers = new llvm::GlobalVariable(
+ *engine->module(), array_type, true /*constant*/,
+ llvm::GlobalValue::LinkOnceAnyLinkage, initializer, kScaleMultipliersName);
+ globalScaleMultipliers->setAlignment(16);
+}
+
+// Lookup intrinsic functions
+void DecimalIR::InitializeIntrinsics() {
+ sadd_with_overflow_fn_ = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::sadd_with_overflow, types()->i128_type());
+ DCHECK_NE(sadd_with_overflow_fn_, nullptr);
+
+ smul_with_overflow_fn_ = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::smul_with_overflow, types()->i128_type());
+ DCHECK_NE(smul_with_overflow_fn_, nullptr);
+
+ i128_with_overflow_struct_type_ =
+ sadd_with_overflow_fn_->getFunctionType()->getReturnType();
+}
+
+// CPP: return kScaleMultipliers[scale]
+llvm::Value* DecimalIR::GetScaleMultiplier(llvm::Value* scale) {
+ auto const_array = module()->getGlobalVariable(kScaleMultipliersName);
+ auto ptr = ir_builder()->CreateGEP(const_array, {types()->i32_constant(0), scale});
+ return ir_builder()->CreateLoad(ptr);
+}
+
+// CPP: x <= y ? y : x
+llvm::Value* DecimalIR::GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale) {
+ llvm::Value* le = ir_builder()->CreateICmpSLE(x_scale, y_scale);
+ return ir_builder()->CreateSelect(le, y_scale, x_scale);
+}
+
+// CPP: return (increase_scale_by <= 0) ?
+// in_value : in_value * GetScaleMultiplier(increase_scale_by)
+llvm::Value* DecimalIR::IncreaseScale(llvm::Value* in_value,
+ llvm::Value* increase_scale_by) {
+ llvm::Value* le_zero =
+ ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0));
+ // then block
+ auto then_lambda = [&] { return in_value; };
+
+ // else block
+ auto else_lambda = [&] {
+ llvm::Value* multiplier = GetScaleMultiplier(increase_scale_by);
+ return ir_builder()->CreateMul(in_value, multiplier);
+ };
+
+ return BuildIfElse(le_zero, types()->i128_type(), then_lambda, else_lambda);
+}
+
+// CPP: return (increase_scale_by <= 0) ?
+// {in_value,false} : {in_value * GetScaleMultiplier(increase_scale_by),true}
+//
+// The return value also indicates if there was an overflow while increasing the scale.
+DecimalIR::ValueWithOverflow DecimalIR::IncreaseScaleWithOverflowCheck(
+ llvm::Value* in_value, llvm::Value* increase_scale_by) {
+ llvm::Value* le_zero =
+ ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0));
+
+ // then block
+ auto then_lambda = [&] {
+ ValueWithOverflow ret{in_value, types()->false_constant()};
+ return ret.AsStruct(this);
+ };
+
+ // else block
+ auto else_lambda = [&] {
+ llvm::Value* multiplier = GetScaleMultiplier(increase_scale_by);
+ return ir_builder()->CreateCall(smul_with_overflow_fn_, {in_value, multiplier});
+ };
+
+ auto ir_struct =
+ BuildIfElse(le_zero, i128_with_overflow_struct_type_, then_lambda, else_lambda);
+ return ValueWithOverflow::MakeFromStruct(this, ir_struct);
+}
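For reference, the IR emitted by GetScaleMultiplier / IncreaseScale corresponds to the following plain C++ logic. This is a minimal sketch using the compiler's __int128; the helper names are illustrative and not part of the patch:

#include <cstdint>

// Illustrative only: 10^scale as a 128-bit integer, mirroring the
// gandivaScaleMultipliers lookup table built in AddGlobals().
static __int128 ScaleMultiplierSketch(int32_t scale) {
  __int128 m = 1;
  for (int32_t i = 0; i < scale; ++i) {
    m *= 10;
  }
  return m;
}

// Mirrors IncreaseScale(): a no-op for non-positive deltas, a multiply otherwise.
// The WithOverflowCheck variant additionally reports whether the multiply wrapped
// (via llvm.smul.with.overflow in the generated IR).
static __int128 IncreaseScaleSketch(__int128 in, int32_t delta) {
  return (delta <= 0) ? in : in * ScaleMultiplierSketch(delta);
}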
+
+// CPP: return (reduce_scale_by <= 0) ?
+// in_value : in_value / GetScaleMultiplier(reduce_scale_by)
+//
+// ReduceScale cannot cause an overflow.
+llvm::Value* DecimalIR::ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by) {
+ auto le_zero = ir_builder()->CreateICmpSLE(reduce_scale_by, types()->i32_constant(0));
+ // then block
+ auto then_lambda = [&] { return in_value; };
+
+ // else block
+ auto else_lambda = [&] {
+ // TODO: handle rounding.
+ llvm::Value* multiplier = GetScaleMultiplier(reduce_scale_by);
+ return ir_builder()->CreateSDiv(in_value, multiplier);
+ };
+
+ return BuildIfElse(le_zero, types()->i128_type(), then_lambda, else_lambda);
+}
+
+/// @brief Fast-path for add
+/// Adjust x and y to the same scale, and add them.
+llvm::Value* DecimalIR::AddFastPath(const ValueFull& x, const ValueFull& y) {
+ auto higher_scale = GetHigherScale(x.scale(), y.scale());
+ ADD_TRACE_32("AddFastPath : higher_scale", higher_scale);
+
+ // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x_scale)
+ auto x_delta = ir_builder()->CreateSub(higher_scale, x.scale());
+ auto x_scaled = IncreaseScale(x.value(), x_delta);
+ ADD_TRACE_128("AddFastPath : x_scaled", x_scaled);
+
+ // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale)
+ auto y_delta = ir_builder()->CreateSub(higher_scale, y.scale());
+ auto y_scaled = IncreaseScale(y.value(), y_delta);
+ ADD_TRACE_128("AddFastPath : y_scaled", y_scaled);
+
+ auto sum = ir_builder()->CreateAdd(x_scaled, y_scaled);
+ ADD_TRACE_128("AddFastPath : sum", sum);
+ return sum;
+}
+
+/// @brief Add with overflow check.
+/// Adjust x and y to the same scale, add them, and reduce sum to output scale.
+/// If there is an overflow, the sum is set to 0.
+DecimalIR::ValueWithOverflow DecimalIR::AddWithOverflowCheck(const ValueFull& x,
+ const ValueFull& y,
+ const ValueFull& out) {
+ auto higher_scale = GetHigherScale(x.scale(), y.scale());
+ ADD_TRACE_32("AddWithOverflowCheck : higher_scale", higher_scale);
+
+ // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x.scale())
+ auto x_delta = ir_builder()->CreateSub(higher_scale, x.scale());
+ auto x_scaled = IncreaseScaleWithOverflowCheck(x.value(), x_delta);
+ ADD_TRACE_128("AddWithOverflowCheck : x_scaled", x_scaled.value());
+
+ // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale)
+ auto y_delta = ir_builder()->CreateSub(higher_scale, y.scale());
+ auto y_scaled = IncreaseScaleWithOverflowCheck(y.value(), y_delta);
+ ADD_TRACE_128("AddWithOverflowCheck : y_scaled", y_scaled.value());
+
+ // CPP : sum = x_scaled + y_scaled
+ auto sum_ir_struct = ir_builder()->CreateCall(sadd_with_overflow_fn_,
+ {x_scaled.value(), y_scaled.value()});
+ auto sum = ValueWithOverflow::MakeFromStruct(this, sum_ir_struct);
+ ADD_TRACE_128("AddWithOverflowCheck : sum", sum.value());
+
+ // CPP : overflow ? 0 : sum / GetScaleMultiplier(max_scale - out_scale)
+ auto overflow = GetCombinedOverflow({x_scaled, y_scaled, sum});
+ ADD_TRACE_32("AddWithOverflowCheck : overflow", overflow);
+ auto then_lambda = [&] {
+ // if there is an overflow, the value returned won't be used, so we save the division.
+ return types()->i128_constant(0);
+ };
+ auto else_lambda = [&] {
+ auto reduce_scale_by = ir_builder()->CreateSub(higher_scale, out.scale());
+ return ReduceScale(sum.value(), reduce_scale_by);
+ };
+ auto sum_descaled =
+ BuildIfElse(overflow, types()->i128_type(), then_lambda, else_lambda);
+ return ValueWithOverflow(sum_descaled, overflow);
+}
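In plain C++ the fast path above corresponds to this sketch, building on ScaleMultiplierSketch/IncreaseScaleSketch from the earlier snippet; __int128 stands in for the IR i128 type:

#include <algorithm>
#include <cstdint>

// Mirrors AddFastPath(): bring both operands to the higher scale, then add.
// Safe only when the result precision is known to fit in 38 digits.
static __int128 AddFastPathSketch(__int128 x, int32_t x_scale,
                                  __int128 y, int32_t y_scale) {
  int32_t higher = std::max(x_scale, y_scale);
  return IncreaseScaleSketch(x, higher - x_scale) +
         IncreaseScaleSketch(y, higher - y_scale);
}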
+
+// This is pretty complex, so we delegate to pre-compiled C++ helper functions.
+llvm::Value* DecimalIR::AddLarge(const ValueFull& x, const ValueFull& y,
+ const ValueFull& out) {
+ auto block = ir_builder()->GetInsertBlock();
+ auto out_high_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_hi", block);
+ auto out_low_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_low", block);
+ auto x_split = ValueSplit::MakeFromInt128(this, x.value());
+ auto y_split = ValueSplit::MakeFromInt128(this, y.value());
+
+ std::vector<llvm::Value*> args = {
+ x_split.high(), x_split.low(), x.precision(), x.scale(),
+ y_split.high(), y_split.low(), y.precision(), y.scale(),
+ out.precision(), out.scale(), out_high_ptr, out_low_ptr,
+ };
+ ir_builder()->CreateCall(module()->getFunction("add_large_decimal128_decimal128"),
+ args);
+
+ auto out_high = ir_builder()->CreateLoad(out_high_ptr);
+ auto out_low = ir_builder()->CreateLoad(out_low_ptr);
+ auto sum = ValueSplit(out_high, out_low).AsInt128(this);
+ ADD_TRACE_128("AddLarge : sum", sum);
+ return sum;
+}
+
+/// The output scale/precision cannot be arbitrary values. The algorithm here depends
+/// on them being the same as those computed in DecimalTypeUtil.
+/// TODO: enforce this.
+Status DecimalIR::BuildAdd() {
+ // Create fn prototype :
+ // int128_t
+ // add_decimal128_decimal128(int128_t x_value, int32_t x_precision, int32_t x_scale,
+ // int128_t y_value, int32_t y_precision, int32_t y_scale,
+ // int32_t out_precision, int32_t out_scale)
+ auto i32 = types()->i32_type();
+ auto i128 = types()->i128_type();
+ auto function = BuildFunction("add_decimal128_decimal128", i128,
+ {
+ {"x_value", i128},
+ {"x_precision", i32},
+ {"x_scale", i32},
+ {"y_value", i128},
+ {"y_precision", i32},
+ {"y_scale", i32},
+ {"out_precision", i32},
+ {"out_scale", i32},
+ });
+
+ auto arg_iter = function->arg_begin();
+ ValueFull x(&arg_iter[0], &arg_iter[1], &arg_iter[2]);
+ ValueFull y(&arg_iter[3], &arg_iter[4], &arg_iter[5]);
+ ValueFull out(nullptr, &arg_iter[6], &arg_iter[7]);
+
+ auto entry = llvm::BasicBlock::Create(*context(), "entry", function);
+ ir_builder()->SetInsertPoint(entry);
+
+ // CPP :
+ // if (out_precision < 38) {
+ // return AddFastPath(x, y)
+ // } else {
+ // ret = AddWithOverflowCheck(x, y)
+ // if (ret.overflow)
+ // return AddLarge(x, y)
+ // else
+ // return ret.value;
+ // }
+ llvm::Value* lt_max_precision = ir_builder()->CreateICmpSLT(
+ out.precision(), types()->i32_constant(DecimalTypeUtil::kMaxPrecision));
+ auto then_lambda = [&] {
+ // fast-path add
+ return AddFastPath(x, y);
+ };
+ auto else_lambda = [&] {
+ if (kUseOverflowIntrinsics) {
+ // do the add and check if there was overflow
+ auto ret = AddWithOverflowCheck(x, y, out);
+
+ // if there is an overflow, switch to the AddLarge codepath.
+ return BuildIfElse(ret.overflow(), types()->i128_type(),
+ [&] { return AddLarge(x, y, out); },
+ [&] { return ret.value(); });
+ } else {
+ return AddLarge(x, y, out);
+ }
+ };
+ auto value =
+ BuildIfElse(lt_max_precision, types()->i128_type(), then_lambda, else_lambda);
+
+ // store result to out
+ ir_builder()->CreateRet(value);
+ return Status::OK();
+}
+
+Status DecimalIR::AddFunctions(Engine* engine) {
+ auto decimal_ir = std::make_shared<DecimalIR>(engine);
+
+ // Populate global variables used by decimal operations.
+ decimal_ir->AddGlobals(engine);
+
+ // Lookup intrinsic functions
+ decimal_ir->InitializeIntrinsics();
+
+ // build "add"
+ return decimal_ir->BuildAdd();
+}
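The control flow that BuildAdd() emits reduces to the following C++ skeleton. This is a sketch only: AddFastPathSketch is from the earlier snippet, and AddLargeSketch is a hypothetical stand-in for the pre-compiled add_large_decimal128_decimal128 helper:

#include <cstdint>

// Sketch of the dispatch inside the generated add_decimal128_decimal128.
static __int128 AddDispatchSketch(__int128 x, int32_t x_scale,
                                  __int128 y, int32_t y_scale,
                                  int32_t out_precision, int32_t out_scale) {
  if (out_precision < 38) {
    // Result is guaranteed to fit in 38 digits: no overflow handling needed.
    return AddFastPathSketch(x, x_scale, y, y_scale);
  }
  // kUseOverflowIntrinsics is false, so the generated code goes straight to
  // the pre-compiled large-add path (AddLarge above); AddLargeSketch is a
  // placeholder for that helper.
  return AddLargeSketch(x, x_scale, y, y_scale, out_precision, out_scale);
}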
+
+// Do a bitwise-or of all the overflow bits.
+llvm::Value* DecimalIR::GetCombinedOverflow(
+ std::vector<DecimalIR::ValueWithOverflow> vec) {
+ llvm::Value* res = types()->false_constant();
+ for (auto& val : vec) {
+ res = ir_builder()->CreateOr(res, val.overflow());
+ }
+ return res;
+}
+
+DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromInt128(DecimalIR* decimal_ir,
+ llvm::Value* in) {
+ auto builder = decimal_ir->ir_builder();
+ auto types = decimal_ir->types();
+
+ auto high = builder->CreateLShr(in, types->i128_constant(64));
+ high = builder->CreateTrunc(high, types->i64_type());
+ auto low = builder->CreateTrunc(in, types->i64_type());
+ return ValueSplit(high, low);
+}
+
+/// Convert IR struct {%i64, %i64} to cpp class ValueSplit
+DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromStruct(DecimalIR* decimal_ir,
+ llvm::Value* dstruct) {
+ auto builder = decimal_ir->ir_builder();
+ auto high = builder->CreateExtractValue(dstruct, 0);
+ auto low = builder->CreateExtractValue(dstruct, 1);
+ return DecimalIR::ValueSplit(high, low);
+}
+
+llvm::Value* DecimalIR::ValueSplit::AsInt128(DecimalIR* decimal_ir) const {
+ auto builder = decimal_ir->ir_builder();
+ auto types = decimal_ir->types();
+
+ auto value = builder->CreateSExt(high_, types->i128_type());
+ value = builder->CreateShl(value, types->i128_constant(64));
+ value = builder->CreateAdd(value, builder->CreateZExt(low_, types->i128_type()));
+ return value;
+}
+
+/// Convert IR struct {%i128, %i1} to cpp class ValueWithOverflow
+DecimalIR::ValueWithOverflow DecimalIR::ValueWithOverflow::MakeFromStruct(
+ DecimalIR* decimal_ir, llvm::Value* dstruct) {
+ auto builder = decimal_ir->ir_builder();
+ auto value = builder->CreateExtractValue(dstruct, 0);
+ auto overflow = builder->CreateExtractValue(dstruct, 1);
+ return DecimalIR::ValueWithOverflow(value, overflow);
+}
+
+/// Convert to IR struct {%i128, %i1}
+llvm::Value* DecimalIR::ValueWithOverflow::AsStruct(DecimalIR* decimal_ir) const {
+ auto builder = decimal_ir->ir_builder();
+
+ auto undef = llvm::UndefValue::get(decimal_ir->i128_with_overflow_struct_type_);
+ auto struct_val = builder->CreateInsertValue(undef, value(), 0);
+ return builder->CreateInsertValue(struct_val, overflow(), 1);
+}
+
+/// debug traces
+void DecimalIR::AddTrace(const std::string& fmt, std::vector<llvm::Value*> args) {
+ DCHECK(enable_ir_traces_);
+
+ auto ir_str = ir_builder()->CreateGlobalStringPtr(fmt);
+ args.insert(args.begin(), ir_str);
+ ir_builder()->CreateCall(module()->getFunction("printf"), args, "trace");
+}
+
+void DecimalIR::AddTrace32(const std::string& msg, llvm::Value* value) {
+ AddTrace("DECIMAL_IR_TRACE:: " + msg + " %d\n", {value});
+}
+
+void DecimalIR::AddTrace128(const std::string& msg, llvm::Value* value) {
+ // convert i128 into two i64s for printing
+ auto split = ValueSplit::MakeFromInt128(this, value);
+ AddTrace("DECIMAL_IR_TRACE:: " + msg + " %llx:%llx (%lld:%llu)\n",
+ {split.high(), split.low(), split.high(), split.low()});
+}
+
+} // namespace gandiva
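The ValueSplit helpers implement the usual split/recombine arithmetic for a 128-bit value. In plain C++, a self-contained sketch assuming the compiler's __int128:

#include <cstdint>

// Mirrors ValueSplit::MakeFromInt128: take the raw high and low 64-bit words.
static void SplitSketch(__int128 v, int64_t* hi, uint64_t* lo) {
  *hi = static_cast<int64_t>(v >> 64);  // high 64 bits (same bit pattern as LShr+Trunc)
  *lo = static_cast<uint64_t>(v);       // low 64 bits
}

// Mirrors ValueSplit::AsInt128: sign-extend the high word, shift, add the
// zero-extended low word (SExt, Shl, Add ZExt in the generated IR).
static __int128 CombineSketch(int64_t hi, uint64_t lo) {
  return (static_cast<__int128>(hi) << 64) + lo;
}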
diff --git a/cpp/src/gandiva/decimal_ir.h b/cpp/src/gandiva/decimal_ir.h
new file mode 100644
index 0000000000000..fae762c362d94
--- /dev/null
+++ b/cpp/src/gandiva/decimal_ir.h
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef GANDIVA_DECIMAL_ADD_IR_BUILDER_H
+#define GANDIVA_DECIMAL_ADD_IR_BUILDER_H
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gandiva/function_ir_builder.h"
+
+namespace gandiva {
+
+/// @brief Decimal IR functions
+class DecimalIR : public FunctionIRBuilder {
+ public:
+ explicit DecimalIR(Engine* engine)
+ : FunctionIRBuilder(engine), enable_ir_traces_(false) {}
+
+ /// Build decimal IR functions and add them to the engine.
+ static Status AddFunctions(Engine* engine);
+
+ void EnableTraces() { enable_ir_traces_ = true; }
+
+ private:
+ /// The intrinsic fn for divide with small divisors is about 10x slower, so we
+ /// do not use it.
+ static const bool kUseOverflowIntrinsics = false;
+
+ // Holder for an i128 value, along with its scale and precision.
+ class ValueFull {
+ public:
+ ValueFull(llvm::Value* value, llvm::Value* precision, llvm::Value* scale)
+ : value_(value), precision_(precision), scale_(scale) {}
+
+ llvm::Value* value() const { return value_; }
+ llvm::Value* precision() const { return precision_; }
+ llvm::Value* scale() const { return scale_; }
+
+ private:
+ llvm::Value* value_;
+ llvm::Value* precision_;
+ llvm::Value* scale_;
+ };
+
+ // Holder for an i128 value, and a boolean indicating overflow.
+ class ValueWithOverflow {
+ public:
+ ValueWithOverflow(llvm::Value* value, llvm::Value* overflow)
+ : value_(value), overflow_(overflow) {}
+
+ // Make from IR struct
+ static ValueWithOverflow MakeFromStruct(DecimalIR* decimal_ir, llvm::Value* dstruct);
+
+ // Build a corresponding IR struct
+ llvm::Value* AsStruct(DecimalIR* decimal_ir) const;
+
+ llvm::Value* value() const { return value_; }
+ llvm::Value* overflow() const { return overflow_; }
+
+ private:
+ llvm::Value* value_;
+ llvm::Value* overflow_;
+ };
+
+ // Holder for an i128 value that is split into two i64s
+ class ValueSplit {
+ public:
+ ValueSplit(llvm::Value* high, llvm::Value* low) : high_(high), low_(low) {}
+
+ // Make from i128 value
+ static ValueSplit MakeFromInt128(DecimalIR* decimal_ir, llvm::Value* in);
+
+ // Make from IR struct
+ static ValueSplit MakeFromStruct(DecimalIR* decimal_ir, llvm::Value* dstruct);
+
+ // Combine the two parts into an i128
+ llvm::Value* AsInt128(DecimalIR* decimal_ir) const;
+
+ llvm::Value* high() const { return high_; }
+ llvm::Value* low() const { return low_; }
+
+ private:
+ llvm::Value* high_;
+ llvm::Value* low_;
+ };
+
+ // Add global variables to the module.
+ static void AddGlobals(Engine* engine);
+
+ // Initialize intrinsic functions that are used by decimal operations.
+ void InitializeIntrinsics();
+
+ // Create IR builder for decimal add function.
+ static Status MakeAdd(Engine* engine, std::shared_ptr<DecimalIR>* out);
+
+ // Get the multiplier for specified scale (i.e., 10^scale)
+ llvm::Value* GetScaleMultiplier(llvm::Value* scale);
+
+ // Get the higher of the two scales
+ llvm::Value* GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale);
+
+ // Increase scale of 'in_value' by 'increase_scale_by'.
+ // - If 'increase_scale_by' is <= 0, does nothing.
+ llvm::Value* IncreaseScale(llvm::Value* in_value, llvm::Value* increase_scale_by);
+
+ // Similar to IncreaseScale, but also checks for overflow.
+ ValueWithOverflow IncreaseScaleWithOverflowCheck(llvm::Value* in_value,
+ llvm::Value* increase_scale_by);
+
+ // Reduce scale of 'in_value' by 'reduce_scale_by'.
+ // - If 'reduce_scale_by' is <= 0, does nothing.
+ llvm::Value* ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by);
+
+ // Fast path of add: guaranteed no overflow
+ llvm::Value* AddFastPath(const ValueFull& x, const ValueFull& y);
+
+ // Similar to AddFastPath, but check if there's an overflow.
+ ValueWithOverflow AddWithOverflowCheck(const ValueFull& x, const ValueFull& y,
+ const ValueFull& out);
+
+ // Do addition of large integers (both positive and negative).
+ llvm::Value* AddLarge(const ValueFull& x, const ValueFull& y, const ValueFull& out);
+
+ // Get the combined overflow (logical or).
+ llvm::Value* GetCombinedOverflow(std::vector<ValueWithOverflow> values);
+
+ // Build the function for adding decimals.
+ Status BuildAdd();
+
+ // Add a trace in IR code.
+ void AddTrace(const std::string& fmt, std::vector<llvm::Value*> args);
+
+ // Add a trace msg along with a 32-bit integer.
+ void AddTrace32(const std::string& msg, llvm::Value* value);
+
+ // Add a trace msg along with a 128-bit integer.
+ void AddTrace128(const std::string& msg, llvm::Value* value);
+
+ // name of the global variable having the array of scale multipliers.
+ static const char* kScaleMultipliersName;
+
+ // Intrinsic functions
+ llvm::Function* sadd_with_overflow_fn_;
+ llvm::Function* smul_with_overflow_fn_;
+
+ // struct { i128: value, i1: overflow}
+ llvm::Type* i128_with_overflow_struct_type_;
+
+ // if set to true, ir traces are enabled. Useful for debugging.
+ bool enable_ir_traces_;
+};
+
+} // namespace gandiva
+
+#endif // GANDIVA_DECIMAL_ADD_IR_BUILDER_H
diff --git a/cpp/src/gandiva/decimal_scalar.h b/cpp/src/gandiva/decimal_scalar.h
new file mode 100644
index 0000000000000..5b38770da632a
--- /dev/null
+++ b/cpp/src/gandiva/decimal_scalar.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include "arrow/util/decimal.h"
+#include "gandiva/basic_decimal_scalar.h"
+
+namespace gandiva {
+
+using Decimal128 = arrow::Decimal128;
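A usage sketch for the DecimalScalar128 class defined just below; the expected output format is inferred from its ToString implementation, and the example itself is illustrative only:

#include <iostream>

#include "gandiva/decimal_scalar.h"

int main() {
  // 12.345 represented as unscaled value 12345 with precision 38, scale 3.
  gandiva::DecimalScalar128 d("12345", 38, 3);
  std::cout << d << std::endl;  // expected to print "12345,38,3"
  return 0;
}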
+
+/// Represents a 128-bit decimal value along with its precision and scale.
+///
+/// BasicDecimalScalar128 can be safely compiled to IR without references to libstdc++.
+/// This class has additional functionality on top of BasicDecimalScalar128 to deal with
+/// strings and streams.
+class DecimalScalar128 : public BasicDecimalScalar128 {
+ public:
+ using BasicDecimalScalar128::BasicDecimalScalar128;
+
+ DecimalScalar128(const std::string& value, int32_t precision, int32_t scale)
+ : BasicDecimalScalar128(Decimal128(value), precision, scale) {}
+
+ inline std::string ToString() const {
+ Decimal128 dvalue(value());
+ return dvalue.ToString(0) + "," + std::to_string(precision()) + "," +
+ std::to_string(scale());
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const DecimalScalar128& dec) {
+ os << dec.ToString();
+ return os;
+ }
+};
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/decimal_type_util.cc b/cpp/src/gandiva/decimal_type_util.cc
new file mode 100644
index 0000000000000..2795e913a9484
--- /dev/null
+++ b/cpp/src/gandiva/decimal_type_util.cc
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/logging.h"
+
+namespace gandiva {
+
+constexpr int32_t DecimalTypeUtil::kMaxDecimal32Precision;
+constexpr int32_t DecimalTypeUtil::kMaxDecimal64Precision;
+constexpr int32_t DecimalTypeUtil::kMaxPrecision;
+
+constexpr int32_t DecimalTypeUtil::kMaxScale;
+constexpr int32_t DecimalTypeUtil::kMinAdjustedScale;
+
+#define DCHECK_TYPE(type) \
+ { \
+ DCHECK_GE(type->scale(), 0); \
+ DCHECK_LE(type->precision(), kMaxPrecision); \
+ }
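A worked example of the add/subtract rule implemented below, matching one of the unit tests added later in this patch (decimal_type_util_test.cc):

// Worked example for kOpAdd:
//   in:  (p1, s1) = (30, 10), (p2, s2) = (38, 10)
//   result_scale     = max(10, 10)                    = 10
//   result_precision = max(30 - 10, 38 - 10) + 10 + 1 = 39
// 39 exceeds kMaxPrecision (38), so MakeAdjustedType trims the scale:
//   delta     = 39 - 38    = 1
//   min_scale = min(10, 6) = 6
//   result    = decimal(38, max(10 - 1, 6)) = decimal(38, 9)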
+
+// Implementation of decimal rules.
+Status DecimalTypeUtil::GetResultType(Op op, const Decimal128TypeVector& in_types,
+ Decimal128TypePtr* out_type) {
+ DCHECK_EQ(in_types.size(), 2);
+
+ *out_type = nullptr;
+ auto t1 = in_types[0];
+ auto t2 = in_types[1];
+ DCHECK_TYPE(t1);
+ DCHECK_TYPE(t2);
+
+ int32_t s1 = t1->scale();
+ int32_t s2 = t2->scale();
+ int32_t p1 = t1->precision();
+ int32_t p2 = t2->precision();
+ int32_t result_scale = 0;
+ int32_t result_precision = 0;
+
+ switch (op) {
+ case kOpAdd:
+ case kOpSubtract:
+ result_scale = std::max(s1, s2);
+ result_precision = std::max(p1 - s1, p2 - s2) + result_scale + 1;
+ break;
+
+ case kOpMultiply:
+ result_scale = s1 + s2;
+ result_precision = p1 + p2 + 1;
+ break;
+
+ case kOpDivide:
+ result_scale = std::max(kMinAdjustedScale, s1 + p2 + 1);
+ result_precision = p1 - s1 + s2 + result_scale;
+ break;
+
+ case kOpMod:
+ result_scale = std::max(s1, s2);
+ result_precision = std::min(p1 - s1, p2 - s2) + result_scale;
+ break;
+ }
+ *out_type = MakeAdjustedType(result_precision, result_scale);
+ return Status::OK();
+}
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/decimal_type_util.h b/cpp/src/gandiva/decimal_type_util.h
new file mode 100644
index 0000000000000..2c095c159bba0
--- /dev/null
+++ b/cpp/src/gandiva/decimal_type_util.h
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Adapted from Apache Impala
+
+#ifndef GANDIVA_DECIMAL_TYPE_SQL_H
+#define GANDIVA_DECIMAL_TYPE_SQL_H
+
+#include <algorithm>
+#include <memory>
+
+#include "gandiva/arrow.h"
+
+namespace gandiva {
+
+/// @brief Handles conversion of scale/precision for operations on decimal types.
+/// TODO: add validations for all of these.
+class DecimalTypeUtil {
+ public:
+ enum Op {
+ kOpAdd,
+ kOpSubtract,
+ kOpMultiply,
+ kOpDivide,
+ kOpMod,
+ };
+
+ /// The maximum precision representable by a 4-byte decimal
+ static constexpr int32_t kMaxDecimal32Precision = 9;
+
+ /// The maximum precision representable by an 8-byte decimal
+ static constexpr int32_t kMaxDecimal64Precision = 18;
+
+ /// The maximum precision representable by a 16-byte decimal
+ static constexpr int32_t kMaxPrecision = 38;
+
+ // The maximum scale representable.
+ static constexpr int32_t kMaxScale = kMaxPrecision;
+
+ // When operating on decimal inputs, the integer part of the output can exceed the
+ // max precision. In such cases, the scale can be reduced, up to a minimum of
+ // kMinAdjustedScale.
+ // * There is no strong reason for 6; both SQL Server and Impala use 6 too.
+ static constexpr int32_t kMinAdjustedScale = 6;
+
+ // For the specified operation and input scale/precision, determine the output
+ // scale/precision.
+ static Status GetResultType(Op op, const Decimal128TypeVector& in_types,
+ Decimal128TypePtr* out_type);
+
+ static Decimal128TypePtr MakeType(int32_t precision, int32_t scale);
+
+ private:
+ static Decimal128TypePtr MakeAdjustedType(int32_t precision, int32_t scale);
+};
+
+inline Decimal128TypePtr DecimalTypeUtil::MakeType(int32_t precision, int32_t scale) {
+ return std::dynamic_pointer_cast<arrow::Decimal128Type>(
+ arrow::decimal(precision, scale));
+}
+
+// Reduce the scale if possible so that precision stays <= kMaxPrecision
+inline Decimal128TypePtr DecimalTypeUtil::MakeAdjustedType(int32_t precision,
+ int32_t scale) {
+ if (precision > kMaxPrecision) {
+ int32_t min_scale = std::min(scale, kMinAdjustedScale);
+ int32_t delta = precision - kMaxPrecision;
+ precision = kMaxPrecision;
+ scale = std::max(scale - delta, min_scale);
+ }
+ return MakeType(precision, scale);
+}
+
+} // namespace gandiva
+
+#endif // GANDIVA_DECIMAL_TYPE_SQL_H
diff --git a/cpp/src/gandiva/decimal_type_util_test.cc b/cpp/src/gandiva/decimal_type_util_test.cc
new file mode 100644
index 0000000000000..a593990638af5
--- /dev/null
+++ b/cpp/src/gandiva/decimal_type_util_test.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Adapted from Apache Impala
+
+#include <gtest/gtest.h>
+
+#include "gandiva/decimal_type_util.h"
+#include "tests/test_util.h"
+
+namespace gandiva {
+
+#define DECIMAL_TYPE(p, s) DecimalTypeUtil::MakeType(p, s)
+
+Decimal128TypePtr DoOp(DecimalTypeUtil::Op op, Decimal128TypePtr d1,
+ Decimal128TypePtr d2) {
+ Decimal128TypePtr ret_type;
+ EXPECT_OK(DecimalTypeUtil::GetResultType(op, {d1, d2}, &ret_type));
+ return ret_type;
+}
+
+TEST(DecimalResultTypes, Basic) {
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(31, 10),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 10), DECIMAL_TYPE(30, 10)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(32, 6),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 6), DECIMAL_TYPE(30, 5)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(38, 9),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 10), DECIMAL_TYPE(38, 10)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(38, 9),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(38, 10), DECIMAL_TYPE(38, 38)));
+
+ EXPECT_ARROW_TYPE_EQUALS(
+ DECIMAL_TYPE(38, 6),
+ DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(38, 10), DECIMAL_TYPE(38, 2)));
+}
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc
index 59884c5b4ad44..9aaafea8e498e 100644
--- a/cpp/src/gandiva/engine.cc
+++ b/cpp/src/gandiva/engine.cc
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include "gandiva/decimal_ir.h"
 #include "gandiva/exported_funcs_registry.h"

 namespace gandiva {

@@ -94,6 +95,10 @@ Status Engine::Make(std::shared_ptr<Configuration> config,
 auto status = engine_obj->LoadPreCompiledIRFiles(config->byte_code_file_path());
 ARROW_RETURN_NOT_OK(status);

+ // Add decimal functions
+ status = DecimalIR::AddFunctions(engine_obj.get());
+ ARROW_RETURN_NOT_OK(status);
+
 *engine = std::move(engine_obj);
 return Status::OK();
 }

@@ -103,12 +108,11 @@ Status Engine::LoadPreCompiledIRFiles(const std::string& byte_code_file_path) {
 /// Read from file into memory buffer.
 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> buffer_or_error =
 llvm::MemoryBuffer::getFile(byte_code_file_path);
- if (!buffer_or_error) {
- std::stringstream ss;
- ss << "Could not load module from IR " << byte_code_file_path << ": "
- << buffer_or_error.getError().message();
- return Status::CodeGenError(ss.str());
- }
+ ARROW_RETURN_IF(
+ !buffer_or_error,
+ Status::CodeGenError("Could not load module from IR ", byte_code_file_path, ": ",
+ buffer_or_error.getError().message()));
+
 std::unique_ptr<llvm::MemoryBuffer> buffer = move(buffer_or_error.get());

 /// Parse the IR module.
@@ -123,15 +127,11 @@ Status Engine::LoadPreCompiledIRFiles(const std::string& byte_code_file_path) {
 }
 std::unique_ptr<llvm::Module> ir_module = move(module_or_error.get());

- /// Verify the IR module
- if (llvm::verifyModule(*ir_module, &llvm::errs())) {
- return Status::CodeGenError("verify of IR Module failed");
- }
+ ARROW_RETURN_IF(llvm::verifyModule(*ir_module, &llvm::errs()),
+ Status::CodeGenError("verify of IR Module failed"));
+ ARROW_RETURN_IF(llvm::Linker::linkModules(*module_, move(ir_module)),
+ Status::CodeGenError("failed to link IR Modules"));

- // Link this to the primary module.
- if (llvm::Linker::linkModules(*module_, move(ir_module))) {
- return Status::CodeGenError("failed to link IR Modules");
- }
 return Status::OK();
 }

@@ -188,7 +188,7 @@ Status Engine::FinalizeModule(bool optimise_ir, bool dump_ir) {

 // run the optimiser
 llvm::PassManagerBuilder pass_builder;
- pass_builder.OptLevel = 2;
+ pass_builder.OptLevel = 3;
 pass_builder.populateModulePassManager(*pass_manager);
 pass_manager->run(*module_);

@@ -197,13 +197,13 @@ Status Engine::FinalizeModule(bool optimise_ir, bool dump_ir) {
 }
 }

- if (llvm::verifyModule(*module_, &llvm::errs())) {
- return Status::CodeGenError("verify of module failed after optimisation passes");
- }
+ ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()),
+ Status::CodeGenError("Module verification failed after optimizer"));

 // do the compilation
 execution_engine_->finalizeObject();
 module_finalized_ = true;
+
 return Status::OK();
 }

@@ -227,7 +227,7 @@ void Engine::DumpIR(std::string prefix) {
 std::string str;

 llvm::raw_string_ostream stream(str);
- module_->print(stream, NULL);
+ module_->print(stream, nullptr);
 std::cout << "====" << prefix << "===" << str << "\n";
 }
diff --git a/cpp/src/gandiva/engine.h b/cpp/src/gandiva/engine.h
index f377ebc38d3ef..12480148bf422 100644
--- a/cpp/src/gandiva/engine.h
+++ b/cpp/src/gandiva/engine.h
@@ -37,6 +37,8 @@

 namespace gandiva {

+class FunctionIRBuilder;
+
 /// \brief LLVM Execution engine wrapper.
 class Engine {
 public:
@@ -45,9 +47,10 @@ class Engine {
 LLVMTypes* types() { return types_.get(); }
 llvm::Module* module() { return module_; }

- /// factory method to create and initialize the engine object.
+ /// Factory method to create and initialize the engine object.
 ///
- /// \param[out] engine the created engine.
+ /// \param[in] config the engine configuration
+ /// \param[out] engine the created engine
 static Status Make(std::shared_ptr<Configuration> config,
 std::unique_ptr<Engine>* engine);
diff --git a/cpp/src/gandiva/engine_llvm_test.cc b/cpp/src/gandiva/engine_llvm_test.cc
index fe4f82e19320c..627c385f97363 100644
--- a/cpp/src/gandiva/engine_llvm_test.cc
+++ b/cpp/src/gandiva/engine_llvm_test.cc
@@ -19,6 +19,7 @@
 #include <gtest/gtest.h>

 #include "gandiva/llvm_types.h"
+#include "gandiva/tests/test_util.h"

 namespace gandiva {

@@ -100,7 +101,7 @@ llvm::Function* TestEngine::BuildVecAdd(Engine* engine, LLVMTypes* types) {

 TEST_F(TestEngine, TestAddUnoptimised) {
 std::unique_ptr<Engine> engine;
- Status status = Engine::Make(ConfigurationBuilder::DefaultConfiguration(), &engine);
+ auto status = Engine::Make(TestConfiguration(), &engine);
 EXPECT_TRUE(status.ok()) << status.message();
 LLVMTypes types(*engine->context());
 llvm::Function* ir_func = BuildVecAdd(engine.get(), &types);
@@ -115,7 +116,7 @@ TEST_F(TestEngine, TestAddUnoptimised) {

 TEST_F(TestEngine, TestAddOptimised) {
 std::unique_ptr<Engine> engine;
- Status status = Engine::Make(ConfigurationBuilder::DefaultConfiguration(), &engine);
+ auto status = Engine::Make(TestConfiguration(), &engine);
 EXPECT_TRUE(status.ok()) << status.message();
 LLVMTypes types(*engine->context());
 llvm::Function* ir_func = BuildVecAdd(engine.get(), &types);
diff --git a/cpp/src/gandiva/eval_batch.h b/cpp/src/gandiva/eval_batch.h
index 608f4200ce415..093968f232afb 100644
--- a/cpp/src/gandiva/eval_batch.h
+++ b/cpp/src/gandiva/eval_batch.h
@@ -85,7 +85,7 @@ class EvalBatch {
 /// An array of 'num_buffers_', each containing a buffer. The buffer
 /// sizes depend on the data type, but all of them have the same
 /// number of slots (equal to num_records_).
- std::unique_ptr<uint8_t* []> buffers_array_;
+ std::unique_ptr<uint8_t*[]> buffers_array_;

 std::unique_ptr<LocalBitMapsHolder> local_bitmaps_holder_;
diff --git a/cpp/src/gandiva/exported_funcs_registry.h b/cpp/src/gandiva/exported_funcs_registry.h
index 511ec9c212468..35ad5c0fae516 100644
--- a/cpp/src/gandiva/exported_funcs_registry.h
+++ b/cpp/src/gandiva/exported_funcs_registry.h
@@ -18,6 +18,7 @@
 #ifndef GANDIVA_EXPORTED_FUNCS_REGISTRY_H
 #define GANDIVA_EXPORTED_FUNCS_REGISTRY_H

+#include <memory>
 #include
 #include

@@ -30,12 +31,12 @@ class ExportedFuncsBase;
 /// LLVM/IR code.
 class ExportedFuncsRegistry {
 public:
- using list_type = std::vector<ExportedFuncsBase*>;
+ using list_type = std::vector<std::shared_ptr<ExportedFuncsBase>>;

 // Add functions from all the registered classes to the engine.
 static void AddMappings(Engine* engine);

- static bool Register(ExportedFuncsBase* entry) {
+ static bool Register(std::shared_ptr<ExportedFuncsBase> entry) {
 registered().push_back(entry);
 return true;
 }
@@ -48,7 +49,8 @@ class ExportedFuncsRegistry {
 };

 #define REGISTER_EXPORTED_FUNCS(classname) \
- static bool _registered_##classname = ExportedFuncsRegistry::Register(new classname)
+ static bool _registered_##classname = \
+ ExportedFuncsRegistry::Register(std::make_shared<classname>())

 } // namespace gandiva
diff --git a/cpp/src/gandiva/expr_validator.cc b/cpp/src/gandiva/expr_validator.cc
index 3f5d63745f942..43de9d7a053f8 100644
--- a/cpp/src/gandiva/expr_validator.cc
+++ b/cpp/src/gandiva/expr_validator.cc
@@ -24,133 +24,114 @@
 namespace gandiva {

 Status ExprValidator::Validate(const ExpressionPtr& expr) {
- if (expr == nullptr) {
- return Status::ExpressionValidationError("Expression cannot be null.");
- }
+ ARROW_RETURN_IF(expr == nullptr,
+ Status::ExpressionValidationError("Expression cannot be null"));
+
 Node& root = *expr->root();
- Status status = root.Accept(*this);
- if (!status.ok()) {
- return status;
- }
- // validate return type matches
- // no need to check if type is supported
- // since root type has been validated.
- if (!root.return_type()->Equals(*expr->result()->type())) {
- std::stringstream ss;
- ss << "Return type of root node " << root.return_type()->name()
- << " does not match that of expression " << *expr->result()->type();
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_NOT_OK(root.Accept(*this));
+
+ // Ensure root's return type matches the expression return type. Type
+ // support validation is not required because root type is already supported.
+ ARROW_RETURN_IF(!root.return_type()->Equals(*expr->result()->type()),
+ Status::ExpressionValidationError("Return type of root node ",
+ root.return_type()->name(),
+ " does not match that of expression ",
+ expr->result()->type()->name()));
+
 return Status::OK();
 }

 Status ExprValidator::Visit(const FieldNode& node) {
 auto llvm_type = types_->IRType(node.return_type()->id());
- if (llvm_type == nullptr) {
- std::stringstream ss;
- ss << "Field " << node.field()->name() << " has unsupported data type "
- << node.return_type()->name();
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_IF(llvm_type == nullptr,
+ Status::ExpressionValidationError("Field ", node.field()->name(),
+ " has unsupported data type ",
+ node.return_type()->name()));

+ // Ensure that field is found in schema
 auto field_in_schema_entry = field_map_.find(node.field()->name());
+ ARROW_RETURN_IF(field_in_schema_entry == field_map_.end(),
+ Status::ExpressionValidationError("Field ", node.field()->name(),
+ " not in schema."));

- // validate that field is in schema.
- if (field_in_schema_entry == field_map_.end()) {
- std::stringstream ss;
- ss << "Field " << node.field()->name() << " not in schema.";
- return Status::ExpressionValidationError(ss.str());
- }
-
+ // Ensure that the found field matches.
 FieldPtr field_in_schema = field_in_schema_entry->second;
- // validate that field matches the definition in schema.
- if (!field_in_schema->Equals(node.field())) {
- std::stringstream ss;
- ss << "Field definition in schema " << field_in_schema->ToString()
- << " different from field in expression " << node.field()->ToString();
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_IF(!field_in_schema->Equals(node.field()),
+ Status::ExpressionValidationError(
+ "Field definition in schema ", field_in_schema->ToString(),
+ " different from field in expression ", node.field()->ToString()));
+
 return Status::OK();
 }

 Status ExprValidator::Visit(const FunctionNode& node) {
 auto desc = node.descriptor();
 FunctionSignature signature(desc->name(), desc->params(), desc->return_type());
+
 const NativeFunction* native_function = registry_.LookupSignature(signature);
- if (native_function == nullptr) {
- std::stringstream ss;
- ss << "Function " << signature.ToString() << " not supported yet. ";
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_IF(native_function == nullptr,
+ Status::ExpressionValidationError("Function ", signature.ToString(),
+ " not supported yet. "));

 for (auto& child : node.children()) {
- Status status = child->Accept(*this);
- ARROW_RETURN_NOT_OK(status);
+ ARROW_RETURN_NOT_OK(child->Accept(*this));
 }
+
 return Status::OK();
 }

 Status ExprValidator::Visit(const IfNode& node) {
- Status status = node.condition()->Accept(*this);
- ARROW_RETURN_NOT_OK(status);
- status = node.then_node()->Accept(*this);
- ARROW_RETURN_NOT_OK(status);
- status = node.else_node()->Accept(*this);
- ARROW_RETURN_NOT_OK(status);
+ ARROW_RETURN_NOT_OK(node.condition()->Accept(*this));
+ ARROW_RETURN_NOT_OK(node.then_node()->Accept(*this));
+ ARROW_RETURN_NOT_OK(node.else_node()->Accept(*this));

 auto if_node_ret_type = node.return_type();
 auto then_node_ret_type = node.then_node()->return_type();
 auto else_node_ret_type = node.else_node()->return_type();

- if (!if_node_ret_type->Equals(*then_node_ret_type)) {
- std::stringstream ss;
- ss << "Return type of if " << *if_node_ret_type << " and then " << *then_node_ret_type
- << " not matching.";
- return Status::ExpressionValidationError(ss.str());
- }
+ // Then-branch return type must match.
+ ARROW_RETURN_IF(!if_node_ret_type->Equals(*then_node_ret_type),
+ Status::ExpressionValidationError(
+ "Return type of if ", if_node_ret_type->ToString(), " and then ",
+ then_node_ret_type->ToString(), " not matching."));

- if (!if_node_ret_type->Equals(*else_node_ret_type)) {
- std::stringstream ss;
- ss << "Return type of if " << *if_node_ret_type << " and else " << *else_node_ret_type
- << " not matching.";
- return Status::ExpressionValidationError(ss.str());
- }
+ // Else-branch return type must match.
+ ARROW_RETURN_IF(!if_node_ret_type->Equals(*else_node_ret_type),
+ Status::ExpressionValidationError(
+ "Return type of if ", if_node_ret_type->ToString(), " and else ",
+ else_node_ret_type->ToString(), " not matching."));

 return Status::OK();
 }

 Status ExprValidator::Visit(const LiteralNode& node) {
 auto llvm_type = types_->IRType(node.return_type()->id());
- if (llvm_type == nullptr) {
- std::stringstream ss;
- ss << "Value " << node.holder() << " has unsupported data type "
- << node.return_type()->name();
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_IF(llvm_type == nullptr,
+ Status::ExpressionValidationError("Value ", node.holder(),
+ " has unsupported data type ",
+ node.return_type()->name()));
+
 return Status::OK();
 }

 Status ExprValidator::Visit(const BooleanNode& node) {
- Status status;
-
- if (node.children().size() < 2) {
- std::stringstream ss;
- ss << "Boolean expression has " << node.children().size()
- << " children, expected atleast two";
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::ExpressionValidationError("Boolean expression has ", node.children().size(),
+ " children, expected at least two"));

 for (auto& child : node.children()) {
- if (!child->return_type()->Equals(arrow::boolean())) {
- std::stringstream ss;
- ss << "Boolean expression has a child with return type "
- << child->return_type()->name() << ", expected return type boolean";
- return Status::ExpressionValidationError(ss.str());
- }
-
- status = child->Accept(*this);
- ARROW_RETURN_NOT_OK(status);
+ const auto bool_type = arrow::boolean();
+ const auto ret_type = child->return_type();
+
+ ARROW_RETURN_IF(!ret_type->Equals(bool_type),
+ Status::ExpressionValidationError(
+ "Boolean expression has a child with return type ",
+ ret_type->ToString(), ", expected return type boolean"));
+
+ ARROW_RETURN_NOT_OK(child->Accept(*this));
 }
+
 return Status::OK();
 }

@@ -178,18 +159,13 @@ Status ExprValidator::Visit(const InExpressionNode& node) {
 Status ExprValidator::ValidateInExpression(size_t number_of_values,
 DataTypePtr in_expr_return_type,
 DataTypePtr type_of_values) {
- if (static_cast(number_of_values) == 0) {
- std::stringstream ss;
- ss << "IN Expression needs a non-empty constant list to match.";
- return Status::ExpressionValidationError(ss.str());
- }
-
- if (!in_expr_return_type->Equals(type_of_values)) {
- std::stringstream ss;
- ss << "Evaluation expression for IN clause returns " << in_expr_return_type
- << " values are of type" << type_of_values;
- return Status::ExpressionValidationError(ss.str());
- }
+ ARROW_RETURN_IF(number_of_values == 0,
+ Status::ExpressionValidationError(
+ "IN Expression needs a non-empty constant list to match."));
+ ARROW_RETURN_IF(!in_expr_return_type->Equals(type_of_values),
+ Status::ExpressionValidationError(
+ "Evaluation expression for IN clause returns ", in_expr_return_type,
+ " values are of type ", type_of_values));

 return Status::OK();
 }
diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc
index fb5a45e779926..1a087c96f33bd 100644
--- a/cpp/src/gandiva/expression_registry.cc
+++ b/cpp/src/gandiva/expression_registry.cc
@@ -136,10 +136,12 @@ void ExpressionRegistry::AddArrowTypesToVector(arrow::Type::type& type,
 case arrow::Type::type::NA:
 vector.push_back(arrow::null());
 break;
+ case arrow::Type::type::DECIMAL:
+ vector.push_back(arrow::decimal(0, 0));
+ break;
 case arrow::Type::type::FIXED_SIZE_BINARY:
 case
 arrow::Type::type::MAP:
 case arrow::Type::type::INTERVAL:
- case arrow::Type::type::DECIMAL:
 case arrow::Type::type::LIST:
 case arrow::Type::type::STRUCT:
 case arrow::Type::type::UNION:
diff --git a/cpp/src/gandiva/filter.cc b/cpp/src/gandiva/filter.cc
index 7a24d9554ef3f..6075e2574559b 100644
--- a/cpp/src/gandiva/filter.cc
+++ b/cpp/src/gandiva/filter.cc
@@ -40,32 +40,28 @@ Filter::Filter(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,

 Status Filter::Make(SchemaPtr schema, ConditionPtr condition,
 std::shared_ptr<Configuration> configuration,
 std::shared_ptr<Filter>* filter) {
- ARROW_RETURN_FAILURE_IF_FALSE(schema != nullptr,
- Status::Invalid("schema cannot be null"));
- ARROW_RETURN_FAILURE_IF_FALSE(condition != nullptr,
- Status::Invalid("condition cannot be null"));
- ARROW_RETURN_FAILURE_IF_FALSE(configuration != nullptr,
- Status::Invalid("configuration cannot be null"));
+ ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null"));
+ ARROW_RETURN_IF(condition == nullptr, Status::Invalid("Condition cannot be null"));
+ ARROW_RETURN_IF(configuration == nullptr,
+ Status::Invalid("Configuration cannot be null"));
+
 static Cache<FilterCacheKey, std::shared_ptr<Filter>> cache;
 FilterCacheKey cache_key(schema, configuration, *(condition.get()));
- std::shared_ptr<Filter> cachedFilter = cache.GetModule(cache_key);
+ auto cachedFilter = cache.GetModule(cache_key);
 if (cachedFilter != nullptr) {
 *filter = cachedFilter;
 return Status::OK();
 }

+ // Build LLVM generator, and generate code for the specified expression
 std::unique_ptr<LLVMGenerator> llvm_gen;
- Status status = LLVMGenerator::Make(configuration, &llvm_gen);
- ARROW_RETURN_NOT_OK(status);
+ ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen));

 // Run the validation on the expression.
 // Return if the expression is invalid since we will not be able to process further.
 ExprValidator expr_validator(llvm_gen->types(), schema);
- status = expr_validator.Validate(condition);
- ARROW_RETURN_NOT_OK(status);
-
- status = llvm_gen->Build({condition});
- ARROW_RETURN_NOT_OK(status);
+ ARROW_RETURN_NOT_OK(expr_validator.Validate(condition));
+ ARROW_RETURN_NOT_OK(llvm_gen->Build({condition}));

 // Instantiate the filter with the completely built llvm generator
 *filter = std::make_shared<Filter>(std::move(llvm_gen), schema, configuration);
@@ -76,42 +72,33 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition,

 Status Filter::Evaluate(const arrow::RecordBatch& batch,
 std::shared_ptr<SelectionVector> out_selection) {
- if (!batch.schema()->Equals(*schema_)) {
- return Status::Invalid("Schema in RecordBatch must match the schema in Make()");
- }
- if (batch.num_rows() == 0) {
- return Status::Invalid("RecordBatch must be non-empty.");
- }
- if (out_selection == nullptr) {
- return Status::Invalid("out_selection must be non-null.");
- }
- if (out_selection->GetMaxSlots() < batch.num_rows()) {
- std::stringstream ss;
- ss << "out_selection has " << out_selection->GetMaxSlots()
- << " slots, which is less than the batch size " << batch.num_rows();
- return Status::Invalid(ss.str());
- }
+ const auto num_rows = batch.num_rows();
+ ARROW_RETURN_IF(!batch.schema()->Equals(*schema_),
+ Status::Invalid("RecordBatch schema must match the expected filter schema"));
+ ARROW_RETURN_IF(num_rows == 0, Status::Invalid("RecordBatch must be non-empty."));
+ ARROW_RETURN_IF(out_selection == nullptr,
+ Status::Invalid("out_selection must be non-null."));
+ ARROW_RETURN_IF(out_selection->GetMaxSlots() < num_rows,
+ Status::Invalid("Output selection vector capacity too small"));

 // Allocate three local_bitmaps (one for output, one for validity, one to compute the
 // intersection).
- LocalBitMapsHolder bitmaps(batch.num_rows(), 3 /*local_bitmaps*/);
+ LocalBitMapsHolder bitmaps(num_rows, 3 /*local_bitmaps*/);
 int64_t bitmap_size = bitmaps.GetLocalBitMapSize();

 auto validity = std::make_shared<arrow::Buffer>(bitmaps.GetLocalBitMap(0), bitmap_size);
 auto value = std::make_shared<arrow::Buffer>(bitmaps.GetLocalBitMap(1), bitmap_size);
- auto array_data =
- arrow::ArrayData::Make(arrow::boolean(), batch.num_rows(), {validity, value});
+ auto array_data = arrow::ArrayData::Make(arrow::boolean(), num_rows, {validity, value});

 // Execute the expression(s).
- auto status = llvm_generator_->Execute(batch, {array_data});
- ARROW_RETURN_NOT_OK(status);
+ ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, {array_data}));

 // Compute the intersection of the value and validity.
 auto result = bitmaps.GetLocalBitMap(2);
 BitMapAccumulator::IntersectBitMaps(
- result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap((1))}, batch.num_rows());
+ result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap((1))}, num_rows);

- return out_selection->PopulateFromBitMap(result, bitmap_size, batch.num_rows() - 1);
+ return out_selection->PopulateFromBitMap(result, bitmap_size, num_rows - 1);
 }

 } // namespace gandiva
diff --git a/cpp/src/gandiva/function_ir_builder.cc b/cpp/src/gandiva/function_ir_builder.cc
new file mode 100644
index 0000000000000..194273933cd15
--- /dev/null
+++ b/cpp/src/gandiva/function_ir_builder.cc
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
 The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/function_ir_builder.h"
+
+namespace gandiva {
+
+llvm::Value* FunctionIRBuilder::BuildIfElse(llvm::Value* condition,
+ llvm::Type* return_type,
+ std::function<llvm::Value*()> then_func,
+ std::function<llvm::Value*()> else_func) {
+ llvm::IRBuilder<>* builder = ir_builder();
+ llvm::Function* function = builder->GetInsertBlock()->getParent();
+ DCHECK_NE(function, nullptr);
+
+ // Create blocks for the then, else and merge cases.
+ llvm::BasicBlock* then_bb = llvm::BasicBlock::Create(*context(), "then", function);
+ llvm::BasicBlock* else_bb = llvm::BasicBlock::Create(*context(), "else", function);
+ llvm::BasicBlock* merge_bb = llvm::BasicBlock::Create(*context(), "merge", function);
+
+ builder->CreateCondBr(condition, then_bb, else_bb);
+
+ // Emit the then block.
+ builder->SetInsertPoint(then_bb);
+ auto then_value = then_func();
+ builder->CreateBr(merge_bb);
+
+ // refresh then_bb for phi (could have changed due to code generation of then_value).
+ then_bb = builder->GetInsertBlock();
+
+ // Emit the else block.
+ builder->SetInsertPoint(else_bb);
+ auto else_value = else_func();
+ builder->CreateBr(merge_bb);
+
+ // refresh else_bb for phi (could have changed due to code generation of else_value).
+ else_bb = builder->GetInsertBlock();
+
+ // Emit the merge block.
+ builder->SetInsertPoint(merge_bb);
+ llvm::PHINode* result_value = builder->CreatePHI(return_type, 2, "res_value");
+ result_value->addIncoming(then_value, then_bb);
+ result_value->addIncoming(else_value, else_bb);
+ return result_value;
+}
+
+llvm::Function* FunctionIRBuilder::BuildFunction(const std::string& function_name,
+ llvm::Type* return_type,
+ std::vector<NamedArg> in_args) {
+ std::vector<llvm::Type*> arg_types;
+ for (auto& arg : in_args) {
+ arg_types.push_back(arg.type);
+ }
+ auto prototype = llvm::FunctionType::get(return_type, arg_types, false /*isVarArg*/);
+ auto function = llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage,
+ function_name, module());
+
+ uint32_t i = 0;
+ for (auto& fn_arg : function->args()) {
+ DCHECK_LT(i, in_args.size());
+ fn_arg.setName(in_args[i].name);
+ ++i;
+ }
+ return function;
+}
+
+} // namespace gandiva
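A usage sketch for BuildIfElse, mirroring how DecimalIR calls it earlier in this patch. The values x and y are assumed to be already-emitted i128 llvm::Value pointers inside a FunctionIRBuilder subclass; everything here is illustrative:

// Emit the equivalent of "x > y ? x : y" as a PHI over two branches.
llvm::Value* is_greater = ir_builder()->CreateICmpSGT(x, y);
llvm::Value* max_val = BuildIfElse(
    is_greater, types()->i128_type(),
    [&] { return x; },   // then: emitted into the "then" block
    [&] { return y; });  // else: emitted into the "else" block
// max_val is the PHI node merging the two incoming values.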
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_IR_BUILDER_H +#define GANDIVA_FUNCTION_IR_BUILDER_H + +#include +#include +#include +#include + +#include "gandiva/engine.h" +#include "gandiva/gandiva_aliases.h" +#include "gandiva/llvm_types.h" + +namespace gandiva { + +/// @brief Base class for building IR functions. +class FunctionIRBuilder { + public: + explicit FunctionIRBuilder(Engine* engine) : engine_(engine) {} + virtual ~FunctionIRBuilder() = default; + + protected: + LLVMTypes* types() { return engine_->types(); } + llvm::Module* module() { return engine_->module(); } + llvm::LLVMContext* context() { return engine_->context(); } + llvm::IRBuilder<>* ir_builder() { return engine_->ir_builder(); } + + /// Build an if-else block. + llvm::Value* BuildIfElse(llvm::Value* condition, llvm::Type* return_type, + std::function then_func, + std::function else_func); + + struct NamedArg { + std::string name; + llvm::Type* type; + }; + + /// Build llvm fn. + llvm::Function* BuildFunction(const std::string& function_name, llvm::Type* return_type, + std::vector in_args); + + private: + Engine* engine_; +}; + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_IR_BUILDER_H diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 3928fbeb0edb3..452cb6339954c 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -16,450 +16,63 @@ // under the License. #include "gandiva/function_registry.h" - +#include "gandiva/function_registry_arithmetic.h" +#include "gandiva/function_registry_datetime.h" +#include "gandiva/function_registry_hash.h" +#include "gandiva/function_registry_math_ops.h" +#include "gandiva/function_registry_string.h" +#include "gandiva/function_registry_timestamp_arithmetic.h" + +#include +#include #include namespace gandiva { -using arrow::binary; -using arrow::boolean; -using arrow::date64; -using arrow::float32; -using arrow::float64; -using arrow::int16; -using arrow::int32; -using arrow::int64; -using arrow::int8; -using arrow::uint16; -using arrow::uint32; -using arrow::uint64; -using arrow::uint8; -using arrow::utf8; -using std::vector; - -#define STRINGIFY(a) #a - -// Binary functions that : -// - have the same input type for both params -// - output type is same as the input type -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32 -#define BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), kResultNullIfNull, \ - STRINGIFY(NAME##_##TYPE##_##TYPE)) - -// Binary functions that : -// - have the same input type for both params -// - NULL handling is of type NULL_IINTERNAL -// - can return error. -// -// The pre-compiled fn name includes the base name & input type names. eg. 
add_int32_int32 -#define BINARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(), \ - kResultNullIfNull, STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \ - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors) - -#define BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(NAME, TYPE) \ - BINARY_UNSAFE_NULL_IF_NULL(NAME, TYPE, TYPE) - -// Binary functions that : -// - have different input types, or output type -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type names. eg. mod_int64_int32 -#define BINARY_GENERIC_SAFE_NULL_IF_NULL(NAME, IN_TYPE1, IN_TYPE2, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE1(), IN_TYPE2()}, OUT_TYPE(), \ - kResultNullIfNull, STRINGIFY(NAME##_##IN_TYPE1##_##IN_TYPE2)) - -// Binary functions that : -// - have the same input type -// - output type is boolean -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type names. -// eg. equal_int32_int32 -#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullIfNull, \ - STRINGIFY(NAME##_##TYPE##_##TYPE)) - -// Unary functions that : -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 -#define UNARY_SAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \ - STRINGIFY(NAME##_##IN_TYPE)) - -// Unary functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. eg. isnull_int32 -#define UNARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, boolean(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE)) - -// Unary functions that : -// - NULL handling is of type NULL_INTERNAL -// -// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 -#define UNARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \ - STRINGIFY(NAME##_##IN_TYPE), \ - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors) - -// Binary functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type names, -// eg. is_distinct_from_int32_int32 -#define BINARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE##_##TYPE)) - -// Extract functions (used with data/time types) that : -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type name. eg. extractYear_date -#define EXTRACT_SAFE_NULL_IF_NULL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullIfNull, \ - STRINGIFY(NAME##_##TYPE)) - -// Hash32 functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. hash32_int8 -#define HASH32_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE)) - -// Hash32 functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. 
hash32_int8 -#define HASH64_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE)) - -// Hash32 functions with seed that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8 -#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), kResultNullNever, \ - STRINGIFY(NAME##WithSeed_##TYPE)) - -// Hash64 functions with seed that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8 -#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), kResultNullNever, \ - STRINGIFY(NAME##WithSeed_##TYPE)) - -// Iterate the inner macro over all numeric types -#define NUMERIC_TYPES(INNER, NAME) \ - INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64), \ - INNER(NAME, uint8), INNER(NAME, uint16), INNER(NAME, uint32), INNER(NAME, uint64), \ - INNER(NAME, float32), INNER(NAME, float64) - -// Iterate the inner macro over numeric and date/time types -#define NUMERIC_DATE_TYPES(INNER, NAME) \ - NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME) - -// Iterate the inner macro over all date types -#define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp) - -// Iterate the inner macro over all time types -#define TIME_TYPES(INNER, NAME) INNER(NAME, time32) - -// Iterate the inner macro over all data types -#define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary) - -// Iterate the inner macro over all numeric types, date types and bool type -#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \ - NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean) - -// Iterate the inner macro over all numeric types, date types, bool and varlen types -#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \ - NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME) - -// list of registered native functions. 
-NativeFunction FunctionRegistry::pc_registry_[] = { - // Arithmetic operations - NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, add), - NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, subtract), - NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, multiply), - NUMERIC_TYPES(BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL, divide), - BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int32, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int64, int64), - NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, equal), - NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, not_equal), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than_or_equal_to), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than_or_equal_to), - UNARY_SAFE_NULL_IF_NULL(not, boolean, boolean), - - // cast operations - UNARY_SAFE_NULL_IF_NULL(castBIGINT, int32, int64), - UNARY_SAFE_NULL_IF_NULL(castFLOAT4, int32, float32), - UNARY_SAFE_NULL_IF_NULL(castFLOAT4, int64, float32), - UNARY_SAFE_NULL_IF_NULL(castFLOAT8, int32, float64), - UNARY_SAFE_NULL_IF_NULL(castFLOAT8, int64, float64), - UNARY_SAFE_NULL_IF_NULL(castFLOAT8, float32, float64), - UNARY_SAFE_NULL_IF_NULL(castDATE, int64, date64), - - // extended math ops - UNARY_SAFE_NULL_IF_NULL(cbrt, int32, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, int64, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, float32, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, float64, float64), - - UNARY_SAFE_NULL_IF_NULL(exp, int32, float64), - UNARY_SAFE_NULL_IF_NULL(exp, int64, float64), - UNARY_SAFE_NULL_IF_NULL(exp, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(exp, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(exp, float32, float64), - UNARY_SAFE_NULL_IF_NULL(exp, float64, float64), - - UNARY_SAFE_NULL_IF_NULL(log, int32, float64), - UNARY_SAFE_NULL_IF_NULL(log, int64, float64), - UNARY_SAFE_NULL_IF_NULL(log, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(log, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(log, float32, float64), - UNARY_SAFE_NULL_IF_NULL(log, float64, float64), - - UNARY_SAFE_NULL_IF_NULL(log10, int32, float64), - UNARY_SAFE_NULL_IF_NULL(log10, int64, float64), - UNARY_SAFE_NULL_IF_NULL(log10, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(log10, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(log10, float32, float64), - UNARY_SAFE_NULL_IF_NULL(log10, float64, float64), - - BINARY_UNSAFE_NULL_IF_NULL(log, int32, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, int64, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, uint32, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, uint64, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, float32, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, float64, float64), - - BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, float64), - - // nullable never operations - NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnull), - NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnotnull), - NUMERIC_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnumeric), - - // nullable never binary operations - NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, is_distinct_from), - NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, is_not_distinct_from), - - // date/timestamp operations - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMillennium), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractCentury), - 
DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDecade), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractYear), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractQuarter), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMonth), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractWeek), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDay), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractHour), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMinute), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractSecond), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractEpoch), - - BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, date64, date64, float64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, timestamp, timestamp, float64), - - // date_trunc operations on date/timestamp - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Millennium), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Century), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Decade), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Year), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Quarter), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Month), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Week), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Day), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Hour), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Minute), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Second), - - // time operations - TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractHour), - TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMinute), - TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractSecond), - - // timestamp diff operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffSecond, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffMinute, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffHour, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffDay, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffWeek, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffMonth, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffQuarter, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffYear, timestamp, timestamp, int32), - - // timestamp add int32 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, timestamp, int32, timestamp), - // date add int32 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, date64, int32, date64), - 
BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, date64, int32, date64), - - // timestamp add int64 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, timestamp, int64, timestamp), - // date add int64 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, date64, int64, date64), - - // date_add(date64, int32), date_add(timestamp, int32) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, timestamp, int32, timestamp), - - // date_add(date64, int64), date_add(timestamp, int64) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, timestamp, int64, timestamp), - - // date_add(int32, date64), date_add(int32, timestamp) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int32, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int32, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int32, timestamp, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int32, timestamp, timestamp), - - // date_add(int64, date64), date_add(int64, timestamp) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int64, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int64, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int64, timestamp, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int64, timestamp, timestamp), - - // date_sub(date64, int32), subtract and date_diff - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, date64, int32, date64), - // date_sub(timestamp, int32), subtract and date_diff - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, timestamp, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, timestamp, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, timestamp, int32, date64), - - // date_sub(date64, int64), subtract and date_diff - 
BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, date64, int64, date64), - // date_sub(timestamp, int64), subtract and date_diff - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, timestamp, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, timestamp, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, timestamp, int64, date64), - - // hash functions - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32AsDouble), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32AsDouble), - - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64AsDouble), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64AsDouble), - - // utf8/binary operations - UNARY_SAFE_NULL_IF_NULL(octet_length, utf8, int32), - UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32), - UNARY_SAFE_NULL_IF_NULL(bit_length, utf8, int32), - UNARY_SAFE_NULL_IF_NULL(bit_length, binary, int32), - UNARY_UNSAFE_NULL_IF_NULL(char_length, utf8, int32), - UNARY_UNSAFE_NULL_IF_NULL(length, utf8, int32), - UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, binary, int32), - - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, equal), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, not_equal), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than_or_equal_to), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than_or_equal_to), +FunctionRegistry::iterator FunctionRegistry::begin() const { + return &(*pc_registry_.begin()); +} - BINARY_RELATIONAL_SAFE_NULL_IF_NULL(starts_with, utf8), - BINARY_RELATIONAL_SAFE_NULL_IF_NULL(ends_with, utf8), +FunctionRegistry::iterator FunctionRegistry::end() const { + return &(*pc_registry_.end()); +} - NativeFunction("upper", DataTypeVector{utf8()}, utf8(), kResultNullIfNull, - "upper_utf8", NativeFunction::kNeedsContext), +std::vector FunctionRegistry::pc_registry_; - NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), kResultNullIfNull, - "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), +SignatureMap FunctionRegistry::pc_registry_map_ = InitPCMap(); - NativeFunction("castDATE", DataTypeVector{utf8()}, date64(), kResultNullIfNull, - "castDATE_utf8", - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), +SignatureMap FunctionRegistry::InitPCMap() { + SignatureMap map; - NativeFunction("to_date", DataTypeVector{utf8(), utf8(), int32()}, date64(), - kResultNullInternal, "gdv_fn_to_date_utf8_utf8_int32", - NativeFunction::kNeedsContext | NativeFunction::kNeedsFunctionHolder | - NativeFunction::kCanReturnErrors), -}; // namespace gandiva + auto v1 = GetArithmeticFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v1.begin(), v1.end()); -FunctionRegistry::iterator FunctionRegistry::begin() const { - return std::begin(pc_registry_); -} + auto v2 = GetDateTimeFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v2.begin(), v2.end()); -FunctionRegistry::iterator 
FunctionRegistry::end() const { - return std::end(pc_registry_); -} + auto v3 = GetHashFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v3.begin(), v3.end()); -FunctionRegistry::SignatureMap FunctionRegistry::pc_registry_map_ = InitPCMap(); + auto v4 = GetMathOpsFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v4.begin(), v4.end()); -FunctionRegistry::SignatureMap FunctionRegistry::InitPCMap() { - SignatureMap map; + auto v5 = GetStringFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v5.begin(), v5.end()); - int num_entries = static_cast(sizeof(pc_registry_) / sizeof(NativeFunction)); - for (int i = 0; i < num_entries; i++) { - const NativeFunction* entry = &pc_registry_[i]; + auto v6 = GetDateTimeArithmeticFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v6.begin(), v6.end()); - DCHECK(map.find(&entry->signature()) == map.end()); - map[&entry->signature()] = entry; - // printf("%s -> %s\n", entry->signature().ToString().c_str(), - // entry->pc_name().c_str()); + for (auto& elem : pc_registry_) { + map.insert(std::make_pair(&(elem.signature()), &elem)); } + return map; } const NativeFunction* FunctionRegistry::LookupSignature( const FunctionSignature& signature) const { auto got = pc_registry_map_.find(&signature); - return got == pc_registry_map_.end() ? NULL : got->second; + return got == pc_registry_map_.end() ? nullptr : got->second; } } // namespace gandiva diff --git a/cpp/src/gandiva/function_registry.h b/cpp/src/gandiva/function_registry.h index 0f74089fc6d8e..810bf2d3eb338 100644 --- a/cpp/src/gandiva/function_registry.h +++ b/cpp/src/gandiva/function_registry.h @@ -18,8 +18,8 @@ #ifndef GANDIVA_FUNCTION_REGISTRY_H #define GANDIVA_FUNCTION_REGISTRY_H -#include - +#include +#include "gandiva/function_registry_common.h" #include "gandiva/gandiva_aliases.h" #include "gandiva/native_function.h" @@ -37,28 +37,9 @@ class FunctionRegistry { iterator end() const; private: - struct KeyHash { - std::size_t operator()(const FunctionSignature* k) const { return k->Hash(); } - }; - - struct KeyEquals { - bool operator()(const FunctionSignature* s1, const FunctionSignature* s2) const { - return *s1 == *s2; - } - }; - - static DataTypePtr time32() { return arrow::time32(arrow::TimeUnit::MILLI); } - - static DataTypePtr time64() { return arrow::time64(arrow::TimeUnit::MICRO); } - - static DataTypePtr timestamp() { return arrow::timestamp(arrow::TimeUnit::MILLI); } - - typedef std::unordered_map - SignatureMap; static SignatureMap InitPCMap(); - static NativeFunction pc_registry_[]; + static std::vector pc_registry_; static SignatureMap pc_registry_map_; }; diff --git a/cpp/src/gandiva/function_registry_arithmetic.cc b/cpp/src/gandiva/function_registry_arithmetic.cc new file mode 100644 index 0000000000000..c5a798cb4e235 --- /dev/null +++ b/cpp/src/gandiva/function_registry_arithmetic.cc @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_arithmetic.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define BINARY_SYMMETRIC_FN(name) NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, name) + +#define BINARY_RELATIONAL_BOOL_FN(name) \ + NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name) + +#define BINARY_RELATIONAL_BOOL_DATE_FN(name) \ + NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name) + +#define UNARY_OCTET_LEN_FN(name) \ + UNARY_SAFE_NULL_IF_NULL(name, utf8, int32), UNARY_SAFE_NULL_IF_NULL(name, binary, int32) + +#define UNARY_CAST_TO_FLOAT64(name) UNARY_SAFE_NULL_IF_NULL(castFLOAT8, name, float64) + +#define UNARY_CAST_TO_FLOAT32(name) UNARY_SAFE_NULL_IF_NULL(castFLOAT4, name, float32) + +std::vector GetArithmeticFunctionRegistry() { + static std::vector arithmetic_fn_registry_ = { + UNARY_SAFE_NULL_IF_NULL(not, boolean, boolean), + UNARY_SAFE_NULL_IF_NULL(castBIGINT, int32, int64), + + UNARY_CAST_TO_FLOAT32(int32), + UNARY_CAST_TO_FLOAT32(int64), + + UNARY_CAST_TO_FLOAT64(int32), + UNARY_CAST_TO_FLOAT64(int64), + UNARY_CAST_TO_FLOAT64(float32), + + UNARY_SAFE_NULL_IF_NULL(castDATE, int64, date64), + + BINARY_SYMMETRIC_FN(add), + BINARY_SYMMETRIC_FN(subtract), + BINARY_SYMMETRIC_FN(multiply), + + NUMERIC_TYPES(BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL, divide), + BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int32, int32), + BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int64, int64), + + BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(add, decimal128), + + BINARY_RELATIONAL_BOOL_FN(equal), + BINARY_RELATIONAL_BOOL_FN(not_equal), + + BINARY_RELATIONAL_BOOL_DATE_FN(less_than), + BINARY_RELATIONAL_BOOL_DATE_FN(less_than_or_equal_to), + BINARY_RELATIONAL_BOOL_DATE_FN(greater_than), + BINARY_RELATIONAL_BOOL_DATE_FN(greater_than_or_equal_to), + + UNARY_OCTET_LEN_FN(octet_length), + UNARY_OCTET_LEN_FN(bit_length), + + UNARY_UNSAFE_NULL_IF_NULL(char_length, utf8, int32), + UNARY_UNSAFE_NULL_IF_NULL(length, utf8, int32), + UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, binary, int32)}; + + return arithmetic_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_arithmetic.h b/cpp/src/gandiva/function_registry_arithmetic.h new file mode 100644 index 0000000000000..e98a4e7b5b1b4 --- /dev/null +++ b/cpp/src/gandiva/function_registry_arithmetic.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef GANDIVA_FUNCTION_REGISTRY_ARITHMETIC_H
+#define GANDIVA_FUNCTION_REGISTRY_ARITHMETIC_H
+
+#include <vector>
+#include "gandiva/native_function.h"
+
+namespace gandiva {
+
+std::vector<NativeFunction> GetArithmeticFunctionRegistry();
+
+}  // namespace gandiva
+
+#endif  // GANDIVA_FUNCTION_REGISTRY_ARITHMETIC_H
diff --git a/cpp/src/gandiva/function_registry_common.h b/cpp/src/gandiva/function_registry_common.h
new file mode 100644
index 0000000000000..3ae065a14769d
--- /dev/null
+++ b/cpp/src/gandiva/function_registry_common.h
@@ -0,0 +1,219 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef GANDIVA_FUNCTION_REGISTRY_COMMON_H
+#define GANDIVA_FUNCTION_REGISTRY_COMMON_H
+
+#include
+#include
+#include
+
+#include "gandiva/arrow.h"
+#include "gandiva/function_signature.h"
+#include "gandiva/gandiva_aliases.h"
+#include "gandiva/native_function.h"
+
+/* This is a private file, intended for internal use by gandiva & must not be included
+ * directly.
+ */
+namespace gandiva {
+
+using arrow::binary;
+using arrow::boolean;
+using arrow::date64;
+using arrow::float32;
+using arrow::float64;
+using arrow::int16;
+using arrow::int32;
+using arrow::int64;
+using arrow::int8;
+using arrow::uint16;
+using arrow::uint32;
+using arrow::uint64;
+using arrow::uint8;
+using arrow::utf8;
+using std::vector;
+
+inline DataTypePtr time32() { return arrow::time32(arrow::TimeUnit::MILLI); }
+
+inline DataTypePtr time64() { return arrow::time64(arrow::TimeUnit::MICRO); }
+
+inline DataTypePtr timestamp() { return arrow::timestamp(arrow::TimeUnit::MILLI); }
+inline DataTypePtr decimal128() { return arrow::decimal(0, 0); }
+
+struct KeyHash {
+  std::size_t operator()(const FunctionSignature* k) const { return k->Hash(); }
+};
+
+struct KeyEquals {
+  bool operator()(const FunctionSignature* s1, const FunctionSignature* s2) const {
+    return *s1 == *s2;
+  }
+};
+
+typedef std::unordered_map<const FunctionSignature*, const NativeFunction*, KeyHash,
+                           KeyEquals>
+    SignatureMap;
+
+// Binary functions that :
+// - have the same input type for both params
+// - output type is same as the input type
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32
+#define BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(NAME, TYPE)                             \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), kResultNullIfNull, \
+                 ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE))
+
+// Binary functions that :
+// - have the same input type for both params
+// - NULL handling is of type NULL_INTERNAL
+// - can return error.
+//
+// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32
+#define BINARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE)                  \
+  NativeFunction(#NAME, DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(),    \
+                 kResultNullIfNull, ARROW_STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \
+                 NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
+
+#define BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(NAME, TYPE) \
+  BINARY_UNSAFE_NULL_IF_NULL(NAME, TYPE, TYPE)
+
+// Binary functions that :
+// - have different input types, or output type
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type names. eg. mod_int64_int32
+#define BINARY_GENERIC_SAFE_NULL_IF_NULL(NAME, IN_TYPE1, IN_TYPE2, OUT_TYPE) \
+  NativeFunction(#NAME, DataTypeVector{IN_TYPE1(), IN_TYPE2()}, OUT_TYPE(),  \
+                 kResultNullIfNull, ARROW_STRINGIFY(NAME##_##IN_TYPE1##_##IN_TYPE2))
+
+// Binary functions that :
+// - have the same input type
+// - output type is boolean
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type names.
+// eg. equal_int32_int32
+#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL(NAME, TYPE)                                \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullIfNull, \
+                 ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE))
+
+// Unary functions that :
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32
+#define UNARY_SAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE)                             \
+  NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \
+                 ARROW_STRINGIFY(NAME##_##IN_TYPE))
+
+// Unary functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. eg. isnull_int32
+#define UNARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE)                                  \
+  NativeFunction(#NAME, DataTypeVector{TYPE()}, boolean(), kResultNullNever, \
+                 ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Unary functions that :
+// - NULL handling is of type NULL_INTERNAL
+//
+// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32
+#define UNARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE)                         \
+  NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \
+                 ARROW_STRINGIFY(NAME##_##IN_TYPE),                                \
+                 NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
+
+// Binary functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type names,
+// eg. is_distinct_from_int32_int32
+#define BINARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE)                                        \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullNever, \
+                 ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE))
+
+// Extract functions (used with date/time types) that :
+// - NULL handling is of type NULL_IF_NULL
+//
+// The pre-compiled fn name includes the base name & input type name. eg. extractYear_date
+#define EXTRACT_SAFE_NULL_IF_NULL(NAME, TYPE)                                 \
+  NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullIfNull, \
+                 ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Hash32 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. eg. hash32_int8
+#define HASH32_SAFE_NULL_NEVER(NAME, TYPE)                                    \
+  NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), kResultNullNever, \
+                 ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Hash64 functions that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. eg. hash64_int8
+#define HASH64_SAFE_NULL_NEVER(NAME, TYPE)                                    \
+  NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullNever, \
+                 ARROW_STRINGIFY(NAME##_##TYPE))
+
+// Hash32 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. eg. hash32WithSeed_int8
+#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE)                                        \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), kResultNullNever, \
+                 ARROW_STRINGIFY(NAME##WithSeed_##TYPE))
+
+// Hash64 functions with seed that :
+// - NULL handling is of type NULL_NEVER
+//
+// The pre-compiled fn name includes the base name & input type name. eg. hash64WithSeed_int8
+#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE)                                        \
+  NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), kResultNullNever, \
+                 ARROW_STRINGIFY(NAME##WithSeed_##TYPE))
+
+// Iterate the inner macro over all numeric types
+#define NUMERIC_TYPES(INNER, NAME)                                                       \
+  INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64),         \
+      INNER(NAME, uint8), INNER(NAME, uint16), INNER(NAME, uint32), INNER(NAME, uint64), \
+      INNER(NAME, float32), INNER(NAME, float64)
+
+// Iterate the inner macro over numeric and date/time types
+#define NUMERIC_DATE_TYPES(INNER, NAME) \
+  NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME)
+
+// Iterate the inner macro over all date types
+#define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp)
+
+// Iterate the inner macro over all time types
+#define TIME_TYPES(INNER, NAME) INNER(NAME, time32)
+
+// Iterate the inner macro over all variable-length types
+#define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary)
+
+// Iterate the inner macro over all numeric types, date types and bool type
+#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
+  NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean)
+
+// Iterate the inner macro over all numeric types, date types, bool and varlen types
+#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \
+  NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME)
+
+}  // namespace gandiva
+
+#endif
diff --git a/cpp/src/gandiva/function_registry_datetime.cc b/cpp/src/gandiva/function_registry_datetime.cc
new file mode 100644
index 0000000000000..145b7d39395b4
--- /dev/null
+++ b/cpp/src/gandiva/function_registry_datetime.cc
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
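To make the macro layer in `function_registry_common.h` concrete: each `*_SAFE_NULL_IF_NULL` macro expands to a single `NativeFunction` entry, and the type-iteration macros fan one base name out across many types. The sketch below is not part of the diff; it shows the int32 slot that `NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, add)` produces after preprocessing, with the constructor argument order taken from the macros above.

```cpp
#include "gandiva/function_registry_common.h"

namespace gandiva {

// Illustrative expansion of one slot of
//   NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, add)
// The real macro emits ten such entries, one per numeric type.
static const NativeFunction kAddInt32(
    "add",                             // SQL-visible base name
    DataTypeVector{int32(), int32()},  // both params share the same type
    int32(),                           // result type equals the input type
    kResultNullIfNull,                 // a null input yields a null output
    "add_int32_int32");                // symbol name in the precompiled bitcode

}  // namespace gandiva
```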
+ +#include "gandiva/function_registry_datetime.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define DATE_EXTRACTION_FNS(name) \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Millennium), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Century), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Decade), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Year), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Quarter), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Month), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Week), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Day), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Hour), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Minute), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Second) + +#define TIME_EXTRACTION_FNS(name) \ + TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Hour), \ + TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Minute), \ + TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Second) + +std::vector GetDateTimeFunctionRegistry() { + static std::vector date_time_fn_registry_ = { + DATE_EXTRACTION_FNS(extract), + DATE_EXTRACTION_FNS(date_trunc_), + + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy), + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow), + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractEpoch), + + TIME_EXTRACTION_FNS(extract), + + NativeFunction("castDATE", DataTypeVector{utf8()}, date64(), kResultNullIfNull, + "castDATE_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + NativeFunction("to_date", DataTypeVector{utf8(), utf8(), int32()}, date64(), + kResultNullInternal, "gdv_fn_to_date_utf8_utf8_int32", + NativeFunction::kNeedsContext | + NativeFunction::kNeedsFunctionHolder | + NativeFunction::kCanReturnErrors)}; + + return date_time_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_datetime.h b/cpp/src/gandiva/function_registry_datetime.h new file mode 100644 index 0000000000000..c9b88942215d8 --- /dev/null +++ b/cpp/src/gandiva/function_registry_datetime.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_DATE_TIME_H +#define GANDIVA_FUNCTION_REGISTRY_DATE_TIME_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetDateTimeFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_DATE_TIME_H diff --git a/cpp/src/gandiva/function_registry_hash.cc b/cpp/src/gandiva/function_registry_hash.cc new file mode 100644 index 0000000000000..a163a230eaca3 --- /dev/null +++ b/cpp/src/gandiva/function_registry_hash.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_hash.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define HASH32_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, name) + +#define HASH32_SEED_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, name) + +#define HASH64_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, name) + +#define HASH64_SEED_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, name) + +std::vector GetHashFunctionRegistry() { + static std::vector hash_fn_registry_ = { + HASH32_SAFE_NULL_NEVER_FN(hash), + HASH32_SAFE_NULL_NEVER_FN(hash32), + HASH32_SAFE_NULL_NEVER_FN(hash32AsDouble), + + HASH32_SEED_SAFE_NULL_NEVER_FN(hash32), + HASH32_SEED_SAFE_NULL_NEVER_FN(hash32AsDouble), + + HASH64_SAFE_NULL_NEVER_FN(hash64), + HASH64_SAFE_NULL_NEVER_FN(hash64AsDouble), + + HASH64_SEED_SAFE_NULL_NEVER_FN(hash64), + HASH64_SEED_SAFE_NULL_NEVER_FN(hash64AsDouble)}; + + return hash_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_hash.h b/cpp/src/gandiva/function_registry_hash.h new file mode 100644 index 0000000000000..dc02cb21e37b5 --- /dev/null +++ b/cpp/src/gandiva/function_registry_hash.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_HASH_H +#define GANDIVA_FUNCTION_REGISTRY_HASH_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetHashFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_HASH_H diff --git a/cpp/src/gandiva/function_registry_math_ops.cc b/cpp/src/gandiva/function_registry_math_ops.cc new file mode 100644 index 0000000000000..31b4b13119a86 --- /dev/null +++ b/cpp/src/gandiva/function_registry_math_ops.cc @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_math_ops.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define MATH_UNARY_OPS(name) \ + UNARY_SAFE_NULL_IF_NULL(name, int32, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, int64, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, uint32, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, uint64, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, float32, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, float64, float64) + +#define MATH_BINARY_UNSAFE(name) \ + BINARY_UNSAFE_NULL_IF_NULL(name, int32, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, int64, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, uint32, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, uint64, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, float32, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, float64, float64) + +#define UNARY_SAFE_NULL_NEVER_BOOL_FN(name) \ + NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, name) + +#define BINARY_SAFE_NULL_NEVER_BOOL_FN(name) \ + NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, name) + +std::vector GetMathOpsFunctionRegistry() { + static std::vector math_fn_registry_ = { + MATH_UNARY_OPS(cbrt), + MATH_UNARY_OPS(exp), + MATH_UNARY_OPS(log), + MATH_UNARY_OPS(log10), + + MATH_BINARY_UNSAFE(log), + + BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, float64), + + UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull), + UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull), + + NUMERIC_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnumeric), + + BINARY_SAFE_NULL_NEVER_BOOL_FN(is_distinct_from), + BINARY_SAFE_NULL_NEVER_BOOL_FN(is_not_distinct_from)}; + + return math_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_math_ops.h b/cpp/src/gandiva/function_registry_math_ops.h new file mode 100644 index 0000000000000..0204ffc8809ac --- /dev/null +++ b/cpp/src/gandiva/function_registry_math_ops.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
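Each category file follows the same pattern as `GetMathOpsFunctionRegistry()` above: a function-local static vector that `FunctionRegistry::InitPCMap()` splices into `pc_registry_` and indexes by signature. Below is a minimal consumer sketch; it is hedged in that the `FunctionSignature` constructor argument order is inferred from its use elsewhere in Gandiva, and `pc_name()` from the registry code in this diff.

```cpp
#include <iostream>

#include "gandiva/function_registry.h"
#include "gandiva/function_signature.h"

int main() {
  gandiva::FunctionRegistry registry;

  // Resolve the precompiled kernel for add(int32, int32) -> int32.
  gandiva::FunctionSignature signature("add", {arrow::int32(), arrow::int32()},
                                       arrow::int32());
  const gandiva::NativeFunction* fn = registry.LookupSignature(signature);
  if (fn != nullptr) {
    // pc_name() is the symbol Gandiva resolves in its precompiled IR module.
    std::cout << "precompiled symbol: " << fn->pc_name() << std::endl;
  }
  return 0;
}
```

Because `pc_registry_map_` keys entries by `FunctionSignature*` with `KeyHash`/`KeyEquals`, a lookup costs one hash probe regardless of how many category vectors were merged.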
+ +#ifndef GANDIVA_FUNCTION_REGISTRY_MATHOPS_H +#define GANDIVA_FUNCTION_REGISTRY_MATHOPS_H + +#include <vector> +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector<NativeFunction> GetMathOpsFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_MATHOPS_H diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc new file mode 100644 index 0000000000000..c97925af9cbb3 --- /dev/null +++ b/cpp/src/gandiva/function_registry_string.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_string.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(name) \ VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name) + +#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(name) \ BINARY_RELATIONAL_SAFE_NULL_IF_NULL(name, utf8) + +std::vector<NativeFunction> GetStringFunctionRegistry() { + static std::vector<NativeFunction> string_fn_registry_ = { + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(equal), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(not_equal), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(less_than), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(less_than_or_equal_to), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(greater_than), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(greater_than_or_equal_to), + + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(starts_with), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(ends_with), + + NativeFunction("upper", DataTypeVector{utf8()}, utf8(), kResultNullIfNull, + "upper_utf8", NativeFunction::kNeedsContext), + + NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), kResultNullIfNull, + "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder)}; + + return string_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_string.h b/cpp/src/gandiva/function_registry_string.h new file mode 100644 index 0000000000000..c9217893e5c0b --- /dev/null +++ b/cpp/src/gandiva/function_registry_string.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_STRING_H +#define GANDIVA_FUNCTION_REGISTRY_STRING_H + +#include <vector> +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector<NativeFunction> GetStringFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_STRING_H diff --git a/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc b/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc new file mode 100644 index 0000000000000..7af76909b7d8f --- /dev/null +++ b/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_timestamp_arithmetic.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define TIMESTAMP_ADD_FNS(name) \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int32, timestamp), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int32, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int64, timestamp), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int64, date64) + +#define TIMESTAMP_DIFF_FN(name) \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, timestamp, int32) + +#define DATE_ADD_FNS(name) \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int32, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int32, timestamp), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int64, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int64, timestamp), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int32, date64, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int32, timestamp, timestamp), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int64, date64, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int64, timestamp, timestamp) + +#define DATE_DIFF_FNS(name) \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int32, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int32, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int64, date64), \ BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int64, date64) + +std::vector<NativeFunction> GetDateTimeArithmeticFunctionRegistry() { + static std::vector<NativeFunction> datetime_fn_registry_ = { + BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, date64, date64, float64), + BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, timestamp, timestamp, float64), + + TIMESTAMP_DIFF_FN(timestampdiffSecond), + TIMESTAMP_DIFF_FN(timestampdiffMinute), + TIMESTAMP_DIFF_FN(timestampdiffHour), + TIMESTAMP_DIFF_FN(timestampdiffDay), + TIMESTAMP_DIFF_FN(timestampdiffWeek), + TIMESTAMP_DIFF_FN(timestampdiffMonth), + TIMESTAMP_DIFF_FN(timestampdiffQuarter), + TIMESTAMP_DIFF_FN(timestampdiffYear), + 
TIMESTAMP_ADD_FNS(timestampaddSecond), + TIMESTAMP_ADD_FNS(timestampaddMinute), + TIMESTAMP_ADD_FNS(timestampaddHour), + TIMESTAMP_ADD_FNS(timestampaddDay), + TIMESTAMP_ADD_FNS(timestampaddWeek), + TIMESTAMP_ADD_FNS(timestampaddMonth), + TIMESTAMP_ADD_FNS(timestampaddQuarter), + TIMESTAMP_ADD_FNS(timestampaddYear), + + DATE_ADD_FNS(date_add), + DATE_ADD_FNS(add), + + DATE_DIFF_FNS(date_sub), + DATE_DIFF_FNS(subtract), + DATE_DIFF_FNS(date_diff)}; + + return datetime_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_timestamp_arithmetic.h b/cpp/src/gandiva/function_registry_timestamp_arithmetic.h new file mode 100644 index 0000000000000..f1b97093663ba --- /dev/null +++ b/cpp/src/gandiva/function_registry_timestamp_arithmetic.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_TIMESTAMP_ARITHMETIC_H +#define GANDIVA_FUNCTION_REGISTRY_TIMESTAMP_ARITHMETIC_H + +#include <vector> +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector<NativeFunction> GetDateTimeArithmeticFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_TIMESTAMP_ARITHMETIC_H diff --git a/cpp/src/gandiva/function_signature.h b/cpp/src/gandiva/function_signature.h index e5dff245b158f..ee82abc367e20 100644 --- a/cpp/src/gandiva/function_signature.h +++ b/cpp/src/gandiva/function_signature.h @@ -56,10 +56,22 @@ class FunctionSignature { std::string ToString() const; private: - // TODO : for some of the types, this shouldn't match type specific data. eg. for - // decimals, this shouldn't match precision/scale. bool DataTypeEquals(const DataTypePtr left, const DataTypePtr right) const { - return left->Equals(right); + if (left->id() == right->id()) { + switch (left->id()) { + case arrow::Type::DECIMAL: { + // For decimal types, the precision/scale isn't part of the signature. + auto dleft = arrow::internal::checked_cast<arrow::DecimalType*>(left.get()); + auto dright = arrow::internal::checked_cast<arrow::DecimalType*>(right.get()); + return (dleft != NULL) && (dright != NULL) && + (dleft->byte_width() == dright->byte_width()); + } + default: + return left->Equals(right); + } + } else { + return false; + } } std::string base_name_; diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index 9f7bc526dbf5b..a07d3903a75ac 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License.
-project(gandiva_jni) +if(CMAKE_VERSION VERSION_LESS 3.11) + message(FATAL_ERROR "Building the Gandiva JNI bindings requires CMake version >= 3.11") +endif() # Find JNI find_package(JNI REQUIRED) diff --git a/cpp/src/gandiva/jni/expression_registry_helper.cc b/cpp/src/gandiva/jni/expression_registry_helper.cc index 5227329db472a..b5c6880a25cf1 100644 --- a/cpp/src/gandiva/jni/expression_registry_helper.cc +++ b/cpp/src/gandiva/jni/expression_registry_helper.cc @@ -121,10 +121,15 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) case arrow::Type::type::NA: gandiva_data_type->set_type(types::GandivaType::NONE); break; + case arrow::Type::type::DECIMAL: { + gandiva_data_type->set_type(types::GandivaType::DECIMAL); + gandiva_data_type->set_precision(0); + gandiva_data_type->set_scale(0); + break; + } case arrow::Type::type::FIXED_SIZE_BINARY: case arrow::Type::type::MAP: case arrow::Type::type::INTERVAL: - case arrow::Type::type::DECIMAL: case arrow::Type::type::LIST: case arrow::Type::type::STRUCT: case arrow::Type::type::UNION: diff --git a/cpp/src/gandiva/jni/jni_common.cc b/cpp/src/gandiva/jni/jni_common.cc index 639ad361f4a8a..339b0cd4e5a9d 100644 --- a/cpp/src/gandiva/jni/jni_common.cc +++ b/cpp/src/gandiva/jni/jni_common.cc @@ -381,6 +381,12 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { return TreeExprBuilder::MakeBinaryLiteral(node.binarynode().value()); } + if (node.has_decimalnode()) { + std::string value = node.decimalnode().value(); + gandiva::DecimalScalar128 literal(value, node.decimalnode().precision(), + node.decimalnode().scale()); + return TreeExprBuilder::MakeDecimalLiteral(literal); + } std::cerr << "Unknown node type in protobuf\n"; return nullptr; } diff --git a/cpp/src/gandiva/jni/symbols.map b/cpp/src/gandiva/jni/symbols.map index a387ae1f5af6b..e0f5def41f00e 100644 --- a/cpp/src/gandiva/jni/symbols.map +++ b/cpp/src/gandiva/jni/symbols.map @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. { - global: extern "C++" { gandiva*; Java*; JNI*; }; + global: extern "C++" { gandiva*; }; Java*; JNI*; local: *; }; diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index d659b22c46e34..f4bbc512e5d6a 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -50,39 +50,40 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { } } - // didn't hit any of the optimisation paths. return original. + // Could not optimize, return original node. 
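To make the fallback concrete, a short sketch grounded in the unit tests further down in this diff (pattern values are illustrative):

    // TryOptimize(BuildLike("xy 123z%"))  -> rewritten to starts_with(a, "xy 123z")
    // a pattern that fits no simple prefix/suffix shape keeps the original 'like'
    // node and is evaluated through the RE2-backed LikeHolder instead.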
return node; } +static bool IsArrowStringLiteral(arrow::Type::type type) { + return type == arrow::Type::STRING || type == arrow::Type::BINARY; +} + Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* holder) { - if (node.children().size() != 2) { - return Status::Invalid("'like' function requires two parameters"); - } + ARROW_RETURN_IF(node.children().size() != 2, + Status::Invalid("'like' function requires two parameters")); auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get()); - if (literal == nullptr) { - return Status::Invalid("'like' function requires a literal as the second parameter"); - } + ARROW_RETURN_IF( + literal == nullptr, + Status::Invalid("'like' function requires a literal as the second parameter")); auto literal_type = literal->return_type()->id(); - if (literal_type != arrow::Type::STRING && literal_type != arrow::Type::BINARY) { - return Status::Invalid( - "'like' function requires a string literal as the second parameter"); - } - auto pattern = boost::get<std::string>(literal->holder()); - return Make(pattern, holder); + ARROW_RETURN_IF( + !IsArrowStringLiteral(literal_type), + Status::Invalid( + "'like' function requires a string literal as the second parameter")); + + return Make(literal->holder().get<std::string>(), holder); } Status LikeHolder::Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder) { std::string pcre_pattern; - auto status = RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern)); - if (!lholder->regex_.ok()) { - return Status::Invalid("building re2 regex failed for pattern " + pcre_pattern); - } + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); *holder = lholder; return Status::OK(); diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 3e3cd37c4fed1..d0ce8bb595021 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -84,6 +84,16 @@ TEST_F(TestLikeHolder, TestRegexEscape) { EXPECT_EQ(res, "%hello_abc.def#"); } +TEST_F(TestLikeHolder, TestDot) { + std::shared_ptr<LikeHolder> like_holder; + + auto status = LikeHolder::Make("abc.", &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_FALSE(like("abcd")); +} + TEST_F(TestLikeHolder, TestOptimise) { // optimise for 'starts_with' auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%")); diff --git a/cpp/src/gandiva/literal_holder.h b/cpp/src/gandiva/literal_holder.h index 0a65ea2c3e249..36afdd3c874e2 100644 --- a/cpp/src/gandiva/literal_holder.h +++ b/cpp/src/gandiva/literal_holder.h @@ -20,14 +20,16 @@ #include <string> -#include <boost/variant.hpp> +#include <arrow/util/variant.h> + +#include "gandiva/decimal_scalar.h" namespace gandiva { using LiteralHolder = - boost::variant<bool, float, double, int8_t, int16_t, int32_t, int64_t, std::string, uint8_t, uint16_t, uint32_t, uint64_t>; - + arrow::util::variant<bool, float, double, int8_t, int16_t, int32_t, int64_t, std::string, uint8_t, uint16_t, uint32_t, uint64_t, DecimalScalar128>; } // namespace gandiva #endif // GANDIVA_LITERAL_HOLDER diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 82d0386cfb9f3..c6844cfe5dd8a 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -44,10 +44,10 @@ LLVMGenerator::LLVMGenerator() Status LLVMGenerator::Make(std::shared_ptr<Configuration> config, std::unique_ptr<LLVMGenerator>* llvm_generator) { std::unique_ptr<LLVMGenerator> llvmgen_obj(new LLVMGenerator()); - Status status = Engine::Make(config, &(llvmgen_obj->engine_)); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(Engine::Make(config,
&(llvmgen_obj->engine_))); *llvm_generator = std::move(llvmgen_obj); + return Status::OK(); } @@ -57,33 +57,29 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out // decompose the expression to separate out value and validities. ExprDecomposer decomposer(function_registry_, annotator_); ValueValidityPairPtr value_validity; - auto status = decomposer.Decompose(*expr->root(), &value_validity); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(decomposer.Decompose(*expr->root(), &value_validity)); // Generate the IR function for the decomposed expression. llvm::Function* ir_function = nullptr; - status = CodeGenExprValue(value_validity->value_expr(), output, idx, &ir_function); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK( + CodeGenExprValue(value_validity->value_expr(), output, idx, &ir_function)); std::unique_ptr<CompiledExpr> compiled_expr( new CompiledExpr(value_validity, output, ir_function)); compiled_exprs_.push_back(std::move(compiled_expr)); + return Status::OK(); } /// Build and optimise module for projection expression. Status LLVMGenerator::Build(const ExpressionVector& exprs) { - Status status; - for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); - status = Add(expr, output); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(Add(expr, output)); } - // optimise, compile and finalize the module - status = engine_->FinalizeModule(optimise_ir_, dump_ir_); - ARROW_RETURN_NOT_OK(status); + // Optimize, compile and finalize the module + ARROW_RETURN_NOT_OK(engine_->FinalizeModule(optimise_ir_, dump_ir_)); // setup the jit functions for each expression. for (auto& compiled_expr : compiled_exprs_) { @@ -91,6 +87,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs) { EvalFunc fn = reinterpret_cast<EvalFunc>(engine_->CompiledFunction(ir_func)); compiled_expr->set_jit_function(fn); } + return Status::OK(); } @@ -107,13 +104,15 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, EvalFunc jit_function = compiled_expr->jit_function(); jit_function(eval_batch->GetBufferArray(), eval_batch->GetLocalBitMapArray(), (int64_t)eval_batch->GetExecutionContext(), record_batch.num_rows()); - // check for execution errors - if (eval_batch->GetExecutionContext()->has_error()) { - return Status::ExecutionError(eval_batch->GetExecutionContext()->get_error()); - } + + ARROW_RETURN_IF( + eval_batch->GetExecutionContext()->has_error(), + Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); + // generate validity vectors. ComputeBitMapsForExpr(*compiled_expr, *eval_batch); } + return Status::OK(); } @@ -233,8 +232,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, FieldDescriptorPtr out engine_->AddFunctionToCompile(func_name); *fn = llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, func_name, module()); - ARROW_RETURN_FAILURE_IF_FALSE((*fn != nullptr), - Status::CodeGenError("Error creating function.")); + ARROW_RETURN_IF((*fn == nullptr), Status::CodeGenError("Error creating function.")); + // Name the arguments llvm::Function::arg_iterator args = (*fn)->arg_begin(); llvm::Value* arg_addrs = &*args; @@ -396,9 +395,21 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args, full_name); DCHECK(value->getType() == ret_type); } + return value; } +std::shared_ptr<DecimalLValue> LLVMGenerator::BuildDecimalLValue(llvm::Value* value, + DataTypePtr arrow_type) { + // only decimals of size 128-bit supported.
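As a reading aid, the shape of the value this helper produces, with hypothetical numbers:

    // e.g. for arrow_type = decimal(38, 6):
    //   DecimalLValue { data = <i128 value>, precision = i32 38, scale = i32 6 }
    // i.e. a decimal result carries its precision and scale through codegen.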
+ DCHECK(is_decimal_128(arrow_type)); + auto decimal_type = + arrow::internal::checked_cast<arrow::Decimal128Type*>(arrow_type.get()); + return std::make_shared<DecimalLValue>(value, nullptr, + types()->i32_constant(decimal_type->precision()), + types()->i32_constant(decimal_type->scale())); +} + #define ADD_VISITOR_TRACE(...) \ if (generator_->enable_ir_traces_) { \ generator_->AddTrace(__VA_ARGS__); \ } @@ -422,20 +433,33 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); llvm::Value* slot_value; - if (dex.FieldType()->id() == arrow::Type::BOOL) { - slot_value = generator_->GetPackedBitValue(slot_ref, loop_var_); - } else { - llvm::Value* slot_offset = builder->CreateGEP(slot_ref, loop_var_); - slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); - } + std::shared_ptr<LValue> lvalue; + switch (dex.FieldType()->id()) { + case arrow::Type::BOOL: + slot_value = generator_->GetPackedBitValue(slot_ref, loop_var_); + lvalue = std::make_shared<LValue>(slot_value); + break; + + case arrow::Type::DECIMAL: { + auto slot_offset = builder->CreateGEP(slot_ref, loop_var_); + slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); + lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType()); + break; + } + + default: { + auto slot_offset = builder->CreateGEP(slot_ref, loop_var_); + slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); + lvalue = std::make_shared<LValue>(slot_value); + break; + } + } ADD_VISITOR_TRACE("visit fixed-len data vector " + dex.FieldName() + " value %T", slot_value); - result_.reset(new LValue(slot_value)); + result_ = lvalue; } void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { @@ -503,52 +527,52 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { switch (dex.type()->id()) { case arrow::Type::BOOL: - value = types->i1_constant(boost::get<bool>(dex.holder())); + value = types->i1_constant(dex.holder().get<bool>()); break; case arrow::Type::UINT8: - value = types->i8_constant(boost::get<uint8_t>(dex.holder())); + value = types->i8_constant(dex.holder().get<uint8_t>()); break; case arrow::Type::UINT16: - value = types->i16_constant(boost::get<uint16_t>(dex.holder())); + value = types->i16_constant(dex.holder().get<uint16_t>()); break; case arrow::Type::UINT32: - value = types->i32_constant(boost::get<uint32_t>(dex.holder())); + value = types->i32_constant(dex.holder().get<uint32_t>()); break; case arrow::Type::UINT64: - value = types->i64_constant(boost::get<uint64_t>(dex.holder())); + value = types->i64_constant(dex.holder().get<uint64_t>()); break; case arrow::Type::INT8: - value = types->i8_constant(boost::get<int8_t>(dex.holder())); + value = types->i8_constant(dex.holder().get<int8_t>()); break; case arrow::Type::INT16: - value = types->i16_constant(boost::get<int16_t>(dex.holder())); + value = types->i16_constant(dex.holder().get<int16_t>()); break; case arrow::Type::INT32: - value = types->i32_constant(boost::get<int32_t>(dex.holder())); + value = types->i32_constant(dex.holder().get<int32_t>()); break; case arrow::Type::INT64: - value = types->i64_constant(boost::get<int64_t>(dex.holder())); + value = types->i64_constant(dex.holder().get<int64_t>()); break; case arrow::Type::FLOAT: - value = types->float_constant(boost::get<float>(dex.holder())); + value = types->float_constant(dex.holder().get<float>()); break; case arrow::Type::DOUBLE: - value = types->double_constant(boost::get<double>(dex.holder())); + value = types->double_constant(dex.holder().get<double>()); break; case arrow::Type::STRING: case
arrow::Type::BINARY: { - const std::string& str = boost::get<std::string>(dex.holder()); + const std::string& str = dex.holder().get<std::string>(); llvm::Constant* str_int_cast = types->i64_constant((int64_t)str.c_str()); value = llvm::ConstantExpr::getIntToPtr(str_int_cast, types->i8_ptr_type()); @@ -557,21 +581,36 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { } case arrow::Type::DATE64: - value = types->i64_constant(boost::get<int64_t>(dex.holder())); + value = types->i64_constant(dex.holder().get<int64_t>()); break; case arrow::Type::TIME32: - value = types->i32_constant(boost::get<int32_t>(dex.holder())); + value = types->i32_constant(dex.holder().get<int32_t>()); break; case arrow::Type::TIME64: - value = types->i64_constant(boost::get<int64_t>(dex.holder())); + value = types->i64_constant(dex.holder().get<int64_t>()); break; case arrow::Type::TIMESTAMP: - value = types->i64_constant(boost::get<int64_t>(dex.holder())); + value = types->i64_constant(dex.holder().get<int64_t>()); break; + case arrow::Type::DECIMAL: { + // build code for struct + auto scalar = dex.holder().get<DecimalScalar128>(); + // ConstantInt doesn't have a get method that takes int128 or a pair of int64. so, + // passing the string representation instead. + auto int128_value = + llvm::ConstantInt::get(llvm::Type::getInt128Ty(*generator_->context()), + Decimal128(scalar.value()).ToIntegerString(), 10); + auto type = arrow::decimal(scalar.precision(), scalar.scale()); + auto lvalue = generator_->BuildDecimalLValue(int128_value, type); + // set it as the l-value and return. + result_ = lvalue; + return; + } + default: DCHECK(0); } @@ -589,13 +628,14 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { auto params = BuildParams(dex.function_holder().get(), dex.args(), false, native_function->NeedsContext()); + auto arrow_return_type = dex.func_descriptor()->return_type(); if (native_function->CanReturnErrors()) { // slow path : if a function can return errors, skip invoking the function // unless all of the input args are valid. Otherwise, it can cause spurious errors. llvm::IRBuilder<>* builder = ir_builder(); LLVMTypes* types = generator_->types(); - auto arrow_type_id = native_function->signature().ret_type()->id(); + auto arrow_type_id = arrow_return_type->id(); auto result_type = types->IRType(arrow_type_id); // Build combined validity of the args. @@ -609,7 +649,7 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { auto then_lambda = [&] { ADD_VISITOR_TRACE("fn " + function_name + " can return errors : all args valid, invoke fn"); - return BuildFunctionCall(native_function, &params); + return BuildFunctionCall(native_function, arrow_return_type, &params); }; // else block @@ -624,10 +664,10 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { return std::make_shared<LValue>(else_value, else_value_len); }; - result_ = BuildIfElse(is_valid, then_lambda, else_lambda, result_type); + result_ = BuildIfElse(is_valid, then_lambda, else_lambda, arrow_return_type); } else { // fast path : invoke function without computing validities.
- result_ = BuildFunctionCall(native_function, &params); + result_ = BuildFunctionCall(native_function, arrow_return_type, &params); } } @@ -639,7 +679,8 @@ void LLVMGenerator::Visitor::Visit(const NullableNeverFuncDex& dex) { auto params = BuildParams(dex.function_holder().get(), dex.args(), true, native_function->NeedsContext()); - result_ = BuildFunctionCall(native_function, &params); + auto arrow_return_type = dex.func_descriptor()->return_type(); + result_ = BuildFunctionCall(native_function, arrow_return_type, &params); } void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { @@ -659,7 +700,8 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); params.push_back(result_valid_ptr); - result_ = BuildFunctionCall(native_function, &params); + auto arrow_return_type = dex.func_descriptor()->return_type(); + result_ = BuildFunctionCall(native_function, arrow_return_type, &params); // load the result validity and truncate to i1. llvm::Value* result_valid_i8 = builder->CreateLoad(result_valid_ptr); @@ -672,7 +714,6 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { void LLVMGenerator::Visitor::Visit(const IfDex& dex) { ADD_VISITOR_TRACE("visit IfExpression"); llvm::IRBuilder<>* builder = ir_builder(); - LLVMTypes* types = generator_->types(); // Evaluate condition. LValuePtr if_condition = BuildValueAndValidity(dex.condition_vv()); @@ -714,9 +755,8 @@ void LLVMGenerator::Visitor::Visit(const IfDex& dex) { }; // build the if-else condition. - auto result_type = types->IRType(dex.result_type()->id()); - result_ = BuildIfElse(validAndMatched, then_lambda, else_lambda, result_type); - if (result_type == types->i8_ptr_type()) { + result_ = BuildIfElse(validAndMatched, then_lambda, else_lambda, dex.result_type()); + if (arrow::is_binary_like(dex.result_type()->id())) { ADD_VISITOR_TRACE("IfElse result length %T", result_->length()); } ADD_VISITOR_TRACE("IfElse result value %T", result_->data()); @@ -906,7 +946,7 @@ void LLVMGenerator::Visitor::VisitInExpression(const InExprDexBase& dex) { LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, std::function<LValuePtr()> then_func, std::function<LValuePtr()> else_func, - llvm::Type* result_type) { + DataTypePtr result_type) { llvm::IRBuilder<>* builder = ir_builder(); llvm::LLVMContext* context = generator_->context(); LLVMTypes* types = generator_->types(); @@ -936,17 +976,31 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, // Emit the merge block.
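For readers less used to LLVM IR, roughly what the merge block contains once both branches flow in (register and block names are illustrative):

    ; res_value  = phi i128 [ %then_value, %then_bb ], [ %else_value, %else_bb ]
    ; res_length = phi i32  [ %then_len, %then_bb ], [ %else_len, %else_bb ]  ; string results only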
builder->SetInsertPoint(merge_bb); - llvm::PHINode* result_value = builder->CreatePHI(result_type, 2, "res_value"); + auto llvm_type = types->IRType(result_type->id()); + llvm::PHINode* result_value = builder->CreatePHI(llvm_type, 2, "res_value"); result_value->addIncoming(then_lvalue->data(), then_bb); result_value->addIncoming(else_lvalue->data(), else_bb); - llvm::PHINode* result_length = nullptr; - if (result_type == types->i8_ptr_type()) { - result_length = builder->CreatePHI(types->i32_type(), 2, "res_length"); - result_length->addIncoming(then_lvalue->length(), then_bb); - result_length->addIncoming(else_lvalue->length(), else_bb); + LValuePtr ret; + switch (result_type->id()) { + case arrow::Type::STRING: { + llvm::PHINode* result_length; + result_length = builder->CreatePHI(types->i32_type(), 2, "res_length"); + result_length->addIncoming(then_lvalue->length(), then_bb); + result_length->addIncoming(else_lvalue->length(), else_bb); + ret = std::make_shared<LValue>(result_value, result_length); + break; + } + + case arrow::Type::DECIMAL: + ret = generator_->BuildDecimalLValue(result_value, result_type); + break; + + default: + ret = std::make_shared<LValue>(result_value); + break; } - return std::make_shared<LValue>(result_value, result_length); + return ret; } LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& pair) { @@ -963,25 +1017,46 @@ LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& } LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, + DataTypePtr arrow_return_type, std::vector<llvm::Value*>* params) { - auto arrow_return_type = func->signature().ret_type()->id(); - auto llvm_return_type = generator_->types()->IRType(arrow_return_type); - - // add extra arg for return length for variable len return types (alloced on stack). - llvm::AllocaInst* result_len_ptr = nullptr; - if (arrow::is_binary_like(arrow_return_type)) { - result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, - "result_len", entry_block_); - params->push_back(result_len_ptr); - has_arena_allocs_ = true; - } + auto types = generator_->types(); + auto arrow_return_type_id = arrow_return_type->id(); + auto llvm_return_type = types->IRType(arrow_return_type_id); + + if (arrow_return_type_id == arrow::Type::DECIMAL) { + // For decimal fns, the output precision/scale are passed along as parameters. + // + // convert from this : + // out = add_decimal(v1, p1, s1, v2, p2, s2) + // to: + // out = add_decimal(v1, p1, s1, v2, p2, s2, out_p, out_s) + + // Append the out_precision and out_scale + auto ret_lvalue = generator_->BuildDecimalLValue(nullptr, arrow_return_type); + params->push_back(ret_lvalue->precision()); + params->push_back(ret_lvalue->scale()); + + // Make the function call + auto out = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params); + ret_lvalue->set_data(out); + return std::move(ret_lvalue); + } else { + // add extra arg for return length for variable len return types (alloced on stack). + llvm::AllocaInst* result_len_ptr = nullptr; + if (arrow::is_binary_like(arrow_return_type_id)) { + result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, + "result_len", entry_block_); + params->push_back(result_len_ptr); + has_arena_allocs_ = true; + } - // Make the function call - llvm::IRBuilder<>* builder = ir_builder(); - auto value = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params); - auto value_len = - (result_len_ptr == nullptr) ?
nullptr : builder->CreateLoad(result_len_ptr); - return std::make_shared<LValue>(value, value_len); + // Make the function call + llvm::IRBuilder<>* builder = ir_builder(); + auto value = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params); + auto value_len = + (result_len_ptr == nullptr) ? nullptr : builder->CreateLoad(result_len_ptr); + return std::make_shared<LValue>(value, value_len); + } } std::vector<llvm::Value*> LLVMGenerator::Visitor::BuildParams( @@ -1007,12 +1082,9 @@ std::vector<llvm::Value*> LLVMGenerator::Visitor::BuildParams( DexPtr value_expr = pair->value_expr(); value_expr->Accept(*this); LValue& result_ref = *result(); - params.push_back(result_ref.data()); - // build length (for var len data types) - if (result_ref.length() != nullptr) { - params.push_back(result_ref.length()); - } + + // append all the parameters corresponding to this LValue. + result_ref.AppendFunctionParams(&params); // build validity. if (with_validity) { diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 49f209d280d13..937e5acc87b2e 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -119,12 +119,13 @@ class LLVMGenerator { bool with_validity, bool with_context); // Generate code to invoke a function call. - LValuePtr BuildFunctionCall(const NativeFunction* func, + LValuePtr BuildFunctionCall(const NativeFunction* func, DataTypePtr arrow_return_type, std::vector<llvm::Value*>* params); // Generate code for an if-else condition. LValuePtr BuildIfElse(llvm::Value* condition, std::function<LValuePtr()> then_func, - std::function<LValuePtr()> else_func, llvm::Type* result_type); + std::function<LValuePtr()> else_func, + DataTypePtr arrow_return_type); // Switch to the entry_block and get reference of the validity/value/offsets buffer llvm::Value* GetBufferReference(int idx, BufferType buffer_type, FieldPtr field); @@ -184,6 +185,10 @@ class LLVMGenerator { void ClearPackedBitValueIfFalse(llvm::Value* bitmap, llvm::Value* position, llvm::Value* value); + // Generate code to build a DecimalLValue with specified value/precision/scale. + std::shared_ptr<DecimalLValue> BuildDecimalLValue(llvm::Value* value, + DataTypePtr arrow_type); + /// Generate code to make a function call (to a pre-compiled IR function) which takes /// 'args' and has a return type 'ret_type'. llvm::Value* AddFunctionCall(const std::string& full_name, llvm::Type* ret_type, diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 818c7912150a9..fed6339314850 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -26,6 +26,7 @@ #include "gandiva/expression.h" #include "gandiva/func_descriptor.h" #include "gandiva/function_registry.h" +#include "gandiva/tests/test_util.h" namespace gandiva { @@ -39,8 +40,7 @@ class TestLLVMGenerator : public ::testing::Test { // Verify that a valid pc function exists for every function in the registry.
TEST_F(TestLLVMGenerator, VerifyPCFunctions) { std::unique_ptr<LLVMGenerator> generator; - Status status = - LLVMGenerator::Make(ConfigurationBuilder::DefaultConfiguration(), &generator); + auto status = LLVMGenerator::Make(TestConfiguration(), &generator); EXPECT_TRUE(status.ok()) << status.message(); llvm::Module* module = generator->module(); @@ -54,8 +54,7 @@ TEST_F(TestLLVMGenerator, VerifyPCFunctions) { TEST_F(TestLLVMGenerator, TestAdd) { // Setup LLVM generator to do an arithmetic add of two vectors std::unique_ptr<LLVMGenerator> generator; - Status status = - LLVMGenerator::Make(ConfigurationBuilder::DefaultConfiguration(), &generator); + auto status = LLVMGenerator::Make(TestConfiguration(), &generator); EXPECT_TRUE(status.ok()); Annotator annotator; diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index 0b89d96e3fb02..18ff627a5651f 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -40,6 +40,7 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::TIMESTAMP, i64_type()}, {arrow::Type::type::STRING, i8_ptr_type()}, {arrow::Type::type::BINARY, i8_ptr_type()}, + {arrow::Type::type::DECIMAL, i128_type()}, }; } diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index dab47d059f7f2..9cf4dd5d1c850 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -43,6 +43,8 @@ class LLVMTypes { llvm::Type* i64_type() { return llvm::Type::getInt64Ty(context_); } + llvm::Type* i128_type() { return llvm::Type::getInt128Ty(context_); } + llvm::Type* float_type() { return llvm::Type::getFloatTy(context_); } llvm::Type* double_type() { return llvm::Type::getDoubleTy(context_); } @@ -53,12 +55,19 @@ class LLVMTypes { llvm::PointerType* i64_ptr_type() { return llvm::PointerType::get(i64_type(), 0); } - llvm::PointerType* ptr_type(llvm::Type* base_type) { - return llvm::PointerType::get(base_type, 0); + llvm::PointerType* i128_ptr_type() { return llvm::PointerType::get(i128_type(), 0); } + + llvm::StructType* i128_split_type() { + // struct with high/low bits (see decimal_ops.cc:DecimalSplit) + return llvm::StructType::get(context_, {i64_type(), i64_type()}, false); } llvm::Type* void_type() { return llvm::Type::getVoidTy(context_); } + llvm::PointerType* ptr_type(llvm::Type* base_type) { + return llvm::PointerType::get(base_type, 0); + } + llvm::Constant* true_constant() { return llvm::ConstantInt::get(context_, llvm::APInt(1, 1)); } @@ -87,6 +96,18 @@ class LLVMTypes { return llvm::ConstantInt::get(context_, llvm::APInt(64, val)); } + llvm::Constant* i128_constant(int64_t val) { + return llvm::ConstantInt::get(context_, llvm::APInt(128, val)); + } + + llvm::Constant* i128_zero() { + return llvm::ConstantInt::get(context_, llvm::APInt(128, 0)); + } + + llvm::Constant* i128_one() { + return llvm::ConstantInt::get(context_, llvm::APInt(128, 1)); + } + llvm::Constant* float_constant(float val) { return llvm::ConstantFP::get(float_type(), val); } diff --git a/cpp/src/gandiva/local_bitmaps_holder.h b/cpp/src/gandiva/local_bitmaps_holder.h index 1dc82562e3110..ae0ba53e99003 100644 --- a/cpp/src/gandiva/local_bitmaps_holder.h +++ b/cpp/src/gandiva/local_bitmaps_holder.h @@ -50,10 +50,10 @@ class LocalBitMapsHolder { int64_t num_records_; /// A container of 'local_bitmaps_', each sized to accommodate 'num_records'. - std::vector<std::unique_ptr<uint8_t>> local_bitmaps_vec_; + std::vector<std::unique_ptr<uint8_t[]>> local_bitmaps_vec_; /// An array of the local bitmaps.
- std::unique_ptr<uint8_t*> local_bitmaps_array_; + std::unique_ptr<uint8_t*[]> local_bitmaps_array_; int64_t local_bitmap_size_; }; @@ -72,7 +72,7 @@ inline LocalBitMapsHolder::LocalBitMapsHolder(int64_t num_records, int num_local // Alloc 'num_local_bitmaps_' number of bitmaps, each of capacity 'num_records_'. for (int i = 0; i < num_local_bitmaps; ++i) { // TODO : round-up to a slab friendly multiple. - std::unique_ptr<uint8_t> bitmap(new uint8_t[local_bitmap_size_]); + std::unique_ptr<uint8_t[]> bitmap(new uint8_t[local_bitmap_size_]); // keep pointer to the bitmap in the array. (local_bitmaps_array_.get())[i] = bitmap.get(); diff --git a/cpp/src/gandiva/lru_cache_test.cc b/cpp/src/gandiva/lru_cache_test.cc index 230a811fc1b31..8ac04c3461b7e 100644 --- a/cpp/src/gandiva/lru_cache_test.cc +++ b/cpp/src/gandiva/lru_cache_test.cc @@ -59,6 +59,6 @@ TEST_F(TestLruCache, TestLruBehavior) { cache_.get(TestCacheKey(1)); cache_.insert(TestCacheKey(3), "hello"); // should have evicted key 2. - ASSERT_EQ(cache_.get(TestCacheKey(1)).value(), "hello"); + ASSERT_EQ(*cache_.get(TestCacheKey(1)), "hello"); } } // namespace gandiva diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 2ff03dcdd9c56..ce5040f6c37a6 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -18,9 +18,11 @@ #ifndef GANDIVA_LVALUE_H #define GANDIVA_LVALUE_H -#include "arrow/util/macros.h" +#include <vector> #include <llvm/IR/IRBuilder.h> +#include "arrow/util/macros.h" +#include "gandiva/logging.h" namespace gandiva { @@ -30,17 +32,48 @@ class LValue { explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR, llvm::Value* validity = NULLPTR) : data_(data), length_(length), validity_(validity) {} + virtual ~LValue() = default; llvm::Value* data() { return data_; } llvm::Value* length() { return length_; } llvm::Value* validity() { return validity_; } + void set_data(llvm::Value* data) { data_ = data; } + + // Append the params required when passing this as a function parameter.
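In summary (a sketch of the resulting argument lists; the decimal case is the DecimalLValue override just below):

    // fixed-width value  -> params: { data }
    // variable-len value -> params: { data, length }
    // decimal value      -> params: { data, precision, scale }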
+ virtual void AppendFunctionParams(std::vector<llvm::Value*>* params) { + params->push_back(data_); + if (length_ != NULLPTR) { + params->push_back(length_); + } + } + private: llvm::Value* data_; llvm::Value* length_; llvm::Value* validity_; }; +class DecimalLValue : public LValue { + public: + DecimalLValue(llvm::Value* data, llvm::Value* validity, llvm::Value* precision, + llvm::Value* scale) + : LValue(data, NULLPTR, validity), precision_(precision), scale_(scale) {} + + llvm::Value* precision() { return precision_; } + llvm::Value* scale() { return scale_; } + + void AppendFunctionParams(std::vector<llvm::Value*>* params) override { + LValue::AppendFunctionParams(params); + params->push_back(precision_); + params->push_back(scale_); + } + + private: + llvm::Value* precision_; + llvm::Value* scale_; +}; + } // namespace gandiva #endif // GANDIVA_LVALUE_H diff --git a/cpp/src/gandiva/native_function.h b/cpp/src/gandiva/native_function.h index 7a250e01cb619..5b130a9313c5b 100644 --- a/cpp/src/gandiva/native_function.h +++ b/cpp/src/gandiva/native_function.h @@ -52,7 +52,6 @@ class NativeFunction { bool NeedsFunctionHolder() const { return (flags_ & kNeedsFunctionHolder) != 0; } bool CanReturnErrors() const { return (flags_ & kCanReturnErrors) != 0; } - private: NativeFunction(const std::string& base_name, const DataTypeVector& param_types, DataTypePtr ret_type, const ResultNullableType& result_nullable_type, const std::string& pc_name, int32_t flags = 0) @@ -61,6 +60,7 @@ class NativeFunction { result_nullable_type_(result_nullable_type), pc_name_(pc_name) {} + private: FunctionSignature signature_; /// attributes @@ -69,8 +69,6 @@ class NativeFunction { /// pre-compiled function name. std::string pc_name_; - - friend class FunctionRegistry; }; } // end namespace gandiva diff --git a/cpp/src/gandiva/node.h b/cpp/src/gandiva/node.h index d31924aa73017..77cde680d1ce8 100644 --- a/cpp/src/gandiva/node.h +++ b/cpp/src/gandiva/node.h @@ -76,12 +76,12 @@ class LiteralNode : public Node { // The default formatter prints in decimal can cause a loss in precision. so, // print in hex. Can't use hexfloat since gcc 4.9 doesn't support it. if (return_type()->id() == arrow::Type::DOUBLE) { - double dvalue = boost::get<double>(holder_); + double dvalue = holder_.get<double>(); uint64_t bits; memcpy(&bits, &dvalue, sizeof(bits)); ss << " raw(" << std::hex << bits << ")"; } else if (return_type()->id() == arrow::Type::FLOAT) { - float fvalue = boost::get<float>(holder_); + float fvalue = holder_.get<float>(); uint32_t bits; memcpy(&bits, &fvalue, sizeof(bits)); ss << " raw(" << std::hex << bits << ")"; diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 886fdced887ff..6e0a0926d3155 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -20,12 +20,15 @@ project(gandiva) set(PRECOMPILED_SRCS arithmetic_ops.cc bitmap.cc + decimal_ops.cc + decimal_wrapper.cc extended_math_ops.cc hash.cc print.cc string_ops.cc time.cc - timestamp_arithmetic.cc) + timestamp_arithmetic.cc + ../../arrow/util/basic_decimal.cc) # Create bitcode for each of the source files.
foreach(SRC_FILE ${PRECOMPILED_SRCS}) @@ -35,9 +38,14 @@ foreach(SRC_FILE ${PRECOMPILED_SRCS}) add_custom_command( OUTPUT ${BC_FILE} COMMAND ${CLANG_EXECUTABLE} - -std=c++11 -emit-llvm -O2 -c ${ABSOLUTE_SRC} -o ${BC_FILE} + -DGANDIVA_IR + -std=c++11 -emit-llvm + -fno-use-cxa-atexit # Workaround for unresolved __dso_handle + -O3 -c ${ABSOLUTE_SRC} -o ${BC_FILE} + ${ARROW_GANDIVA_PC_CXX_FLAGS} -I${CMAKE_SOURCE_DIR}/src - DEPENDS ${SRC_FILE}) + DEPENDS ${SRC_FILE} + COMMAND_EXPAND_LISTS) list(APPEND BC_FILES ${BC_FILE}) endforeach() @@ -58,18 +66,18 @@ function(add_precompiled_unit_test REL_TEST_NAME) set(TEST_NAME "gandiva-precompiled-${TEST_NAME}") add_executable(${TEST_NAME} ${REL_TEST_NAME} ${ARGN}) - add_dependencies(gandiva ${TEST_NAME}) + add_dependencies(gandiva-tests ${TEST_NAME}) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_TEST_LINK_LIBS} ${RE2_LIBRARY} ) target_compile_definitions(${TEST_NAME} PRIVATE GANDIVA_UNIT_TEST=1) add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva;unittest ${TEST_NAME}) + set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva-tests ${TEST_NAME}) endfunction(add_precompiled_unit_test REL_TEST_NAME) # testing -if (ARROW_GANDIVA_BUILD_TESTS) +if (ARROW_BUILD_TESTS) add_precompiled_unit_test(bitmap_test.cc bitmap.cc) add_precompiled_unit_test(epoch_time_point_test.cc) add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc ../context_helper.cc) @@ -77,4 +85,5 @@ if (ARROW_BUILD_TESTS) add_precompiled_unit_test(string_ops_test.cc string_ops.cc ../context_helper.cc) add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc ../context_helper.cc) add_precompiled_unit_test(extended_math_ops_test.cc extended_math_ops.cc ../context_helper.cc) + add_precompiled_unit_test(decimal_ops_test.cc decimal_ops.cc ../decimal_type_util.cc) endif() diff --git a/cpp/src/gandiva/precompiled/decimal_ops.cc b/cpp/src/gandiva/precompiled/decimal_ops.cc new file mode 100644 index 0000000000000..99231fe537f7a --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_ops.cc @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Algorithms adapted from Apache Impala + +#include "gandiva/precompiled/decimal_ops.h" + +#include <algorithm> + +#include "gandiva/decimal_type_util.h" +#include "gandiva/logging.h" + +namespace gandiva { +namespace decimalops { + +using arrow::BasicDecimal128; + +static BasicDecimal128 CheckAndIncreaseScale(const BasicDecimal128& in, int32_t delta) { + return (delta <= 0) ? in : in.IncreaseScaleBy(delta); +} + +static BasicDecimal128 CheckAndReduceScale(const BasicDecimal128& in, int32_t delta) { + return (delta <= 0) ?
in : in.ReduceScaleBy(delta); +} + +/// Adjust x and y to the same scale, and add them. +static BasicDecimal128 AddFastPath(const BasicDecimalScalar128& x, + const BasicDecimalScalar128& y, int32_t out_scale) { + auto higher_scale = std::max(x.scale(), y.scale()); + + auto x_scaled = CheckAndIncreaseScale(x.value(), higher_scale - x.scale()); + auto y_scaled = CheckAndIncreaseScale(y.value(), higher_scale - y.scale()); + return x_scaled + y_scaled; +} + +/// Add x and y, caller has ensured there can be no overflow. +static BasicDecimal128 AddNoOverflow(const BasicDecimalScalar128& x, + const BasicDecimalScalar128& y, int32_t out_scale) { + auto higher_scale = std::max(x.scale(), y.scale()); + auto sum = AddFastPath(x, y, out_scale); + return CheckAndReduceScale(sum, higher_scale - out_scale); +} + +/// Both x_value and y_value must be >= 0 +static BasicDecimal128 AddLargePositive(const BasicDecimalScalar128& x, + const BasicDecimalScalar128& y, + int32_t out_scale) { + DCHECK_GE(x.value(), 0); + DCHECK_GE(y.value(), 0); + + // separate out whole/fractions. + BasicDecimal128 x_left, x_right, y_left, y_right; + x.value().GetWholeAndFraction(x.scale(), &x_left, &x_right); + y.value().GetWholeAndFraction(y.scale(), &y_left, &y_right); + + // Adjust fractional parts to higher scale. + auto higher_scale = std::max(x.scale(), y.scale()); + auto x_right_scaled = CheckAndIncreaseScale(x_right, higher_scale - x.scale()); + auto y_right_scaled = CheckAndIncreaseScale(y_right, higher_scale - y.scale()); + + BasicDecimal128 right; + BasicDecimal128 carry_to_left; + auto multiplier = BasicDecimal128::GetScaleMultiplier(higher_scale); + if (x_right_scaled >= multiplier - y_right_scaled) { + right = x_right_scaled - (multiplier - y_right_scaled); + carry_to_left = 1; + } else { + right = x_right_scaled + y_right_scaled; + carry_to_left = 0; + } + right = CheckAndReduceScale(right, higher_scale - out_scale); + + auto left = x_left + y_left + carry_to_left; + return (left * BasicDecimal128::GetScaleMultiplier(out_scale)) + right; +} + +/// x_value and y_value cannot be 0, and one must be positive and the other negative. +static BasicDecimal128 AddLargeNegative(const BasicDecimalScalar128& x, + const BasicDecimalScalar128& y, + int32_t out_scale) { + DCHECK_NE(x.value(), 0); + DCHECK_NE(y.value(), 0); + DCHECK((x.value() < 0 && y.value() > 0) || (x.value() > 0 && y.value() < 0)); + + // separate out whole/fractions. + BasicDecimal128 x_left, x_right, y_left, y_right; + x.value().GetWholeAndFraction(x.scale(), &x_left, &x_right); + y.value().GetWholeAndFraction(y.scale(), &y_left, &y_right); + + // Adjust fractional parts to higher scale. + auto higher_scale = std::max(x.scale(), y.scale()); + x_right = CheckAndIncreaseScale(x_right, higher_scale - x.scale()); + y_right = CheckAndIncreaseScale(y_right, higher_scale - y.scale()); + + // Overflow not possible because one is +ve and the other is -ve. + auto left = x_left + y_left; + auto right = x_right + y_right; + + // If the whole and fractional parts have different signs, then we need to make the + // fractional part have the same sign as the whole part. If either left or right is + // zero, then nothing needs to be done. 
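A worked example of the sign fixup below, with illustrative values x = 1.10 and y = -0.40, both at scale 2:

    // whole parts:    left  = 1 + 0      = 1
    // fraction parts: right = 10 + (-40) = -30
    // left > 0 and right < 0  =>  left -= 1, right += 10^2
    // giving left = 0, right = 70, i.e. 0.70 (= 1.10 - 0.40)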
+ if (left < 0 && right > 0) { + left += 1; + right -= BasicDecimal128::GetScaleMultiplier(higher_scale); + } else if (left > 0 && right < 0) { + left -= 1; + right += BasicDecimal128::GetScaleMultiplier(higher_scale); + } + right = CheckAndReduceScale(right, higher_scale - out_scale); + return (left * BasicDecimal128::GetScaleMultiplier(out_scale)) + right; +} + +static BasicDecimal128 AddLarge(const BasicDecimalScalar128& x, + const BasicDecimalScalar128& y, int32_t out_scale) { + if (x.value() >= 0 && y.value() >= 0) { + // both positive or 0 + return AddLargePositive(x, y, out_scale); + } else if (x.value() <= 0 && y.value() <= 0) { + // both negative or 0 + BasicDecimalScalar128 x_neg(-x.value(), x.precision(), x.scale()); + BasicDecimalScalar128 y_neg(-y.value(), y.precision(), y.scale()); + return -AddLargePositive(x_neg, y_neg, out_scale); + } else { + // one positive and the other negative + return AddLargeNegative(x, y, out_scale); + } +} + +// Suppose we have a number that requires x bits to be represented and we scale it up by +// 10^scale_by. Let's say now y bits are required to represent it. This function returns +// the maximum possible y - x for a given 'scale_by'. +inline int32_t MaxBitsRequiredIncreaseAfterScaling(int32_t scale_by) { + // We rely on the following formula: + // bits_required(x * 10^y) <= bits_required(x) + floor(log2(10^y)) + 1 + // We precompute floor(log2(10^x)) + 1 for x = 0, 1, 2...75, 76 + DCHECK_GE(scale_by, 0); + DCHECK_LE(scale_by, 76); + static const int32_t floor_log2_plus_one[] = { + 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40, 44, 47, 50, + 54, 57, 60, 64, 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, 100, 103, + 107, 110, 113, 117, 120, 123, 127, 130, 133, 137, 140, 143, 147, 150, 153, 157, + 160, 163, 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, 200, 203, 206, 210, + 213, 216, 220, 223, 226, 230, 233, 236, 240, 243, 246, 250, 253}; + return floor_log2_plus_one[scale_by]; +} + +// If we have a number with 'num_lz' leading zeros, and we scale it up by 10^scale_by, +// this function returns the minimum number of leading zeros the result can have. +inline int32_t MinLeadingZerosAfterScaling(int32_t num_lz, int32_t scale_by) { + DCHECK_GE(scale_by, 0); + DCHECK_LE(scale_by, 76); + int32_t result = num_lz - MaxBitsRequiredIncreaseAfterScaling(scale_by); + return result; +} + +// Returns the maximum possible number of bits required to represent num * 10^scale_by. +inline int32_t MaxBitsRequiredAfterScaling(const BasicDecimalScalar128& num, + int32_t scale_by) { + auto value = num.value(); + auto value_abs = value.Abs(); + + int32_t num_occupied = 128 - value_abs.CountLeadingBinaryZeros(); + DCHECK_GE(scale_by, 0); + DCHECK_LE(scale_by, 76); + return num_occupied + MaxBitsRequiredIncreaseAfterScaling(scale_by); +} + +// Returns the minimum number of leading zero x or y would have after one of them gets +// scaled up to match the scale of the other one. 
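A quick illustration with hypothetical numbers:

    // e.g. x.scale() = 3, y.scale() = 5  =>  x must be scaled by 10^2;
    // MaxBitsRequiredIncreaseAfterScaling(2) = 7 (= floor(log2(10^2)) + 1), so a
    // value with 40 leading zeros before scaling keeps at least 33 afterwards.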
+inline int32_t MinLeadingZeros(const BasicDecimalScalar128& x, + const BasicDecimalScalar128& y) { + auto x_value = x.value(); + auto x_value_abs = x_value.Abs(); + + auto y_value = y.value(); + auto y_value_abs = y_value.Abs(); + + int32_t x_lz = x_value_abs.CountLeadingBinaryZeros(); + int32_t y_lz = y_value_abs.CountLeadingBinaryZeros(); + if (x.scale() < y.scale()) { + x_lz = MinLeadingZerosAfterScaling(x_lz, y.scale() - x.scale()); + } else if (x.scale() > y.scale()) { + y_lz = MinLeadingZerosAfterScaling(y_lz, x.scale() - y.scale()); + } + return std::min(x_lz, y_lz); +} + +BasicDecimal128 Add(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y, + int32_t out_precision, int32_t out_scale) { + if (out_precision < DecimalTypeUtil::kMaxPrecision) { + // fast-path add + return AddFastPath(x, y, out_scale); + } else { + int32_t min_lz = MinLeadingZeros(x, y); + if (min_lz >= 3) { + // If both numbers have at least MIN_LZ leading zeros, we can add them directly + // without the risk of overflow. + // We want the result to have at least 2 leading zeros, which ensures that it fits + // into the maximum decimal because 2^126 - 1 < 10^38 - 1. If both x and y have at + // least 3 leading zeros, then we are guaranteed that the result will have at lest 2 + // leading zeros. + return AddNoOverflow(x, y, out_scale); + } else { + // slower-version : add whole/fraction parts separately, and then, combine. + return AddLarge(x, y, out_scale); + } + } +} + +} // namespace decimalops +} // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/decimal_ops.h b/cpp/src/gandiva/precompiled/decimal_ops.h new file mode 100644 index 0000000000000..1e202b88a2515 --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_ops.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include "gandiva/basic_decimal_scalar.h" + +namespace gandiva { +namespace decimalops { + +/// Return the sum of 'x' and 'y'. +/// out_precision and out_scale are passed along for efficiency, they must match +/// the rules in DecimalTypeSql::GetResultType. +arrow::BasicDecimal128 Add(const BasicDecimalScalar128& x, const BasicDecimalScalar128& y, + int32_t out_precision, int32_t out_scale); + +} // namespace decimalops +} // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/decimal_ops_test.cc b/cpp/src/gandiva/precompiled/decimal_ops_test.cc new file mode 100644 index 0000000000000..6e58106044753 --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_ops_test.cc @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/test-util.h" +#include "gandiva/decimal_scalar.h" +#include "gandiva/decimal_type_util.h" +#include "gandiva/precompiled/decimal_ops.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { + +class TestDecimalSql : public ::testing::Test { + protected: + static void AddAndVerify(const DecimalScalar128& x, const DecimalScalar128& y, + const DecimalScalar128& expected); +}; + +#define EXPECT_DECIMAL_EQ(x, y, expected, actual) \ + EXPECT_EQ(expected, actual) << (x).ToString() << " + " << (y).ToString() \ + << " expected : " << expected.ToString() << " actual " \ + << actual.ToString() + +void TestDecimalSql::AddAndVerify(const DecimalScalar128& x, const DecimalScalar128& y, + const DecimalScalar128& expected) { + auto t1 = std::make_shared(x.precision(), x.scale()); + auto t2 = std::make_shared(y.precision(), y.scale()); + + Decimal128TypePtr out_type; + EXPECT_OK(DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, {t1, t2}, &out_type)); + + auto out_value = decimalops::Add(x, y, out_type->precision(), out_type->scale()); + EXPECT_DECIMAL_EQ( + x, y, expected, + DecimalScalar128(out_value, out_type->precision(), out_type->scale())); +} + +TEST_F(TestDecimalSql, Add) { + // fast-path + AddAndVerify(DecimalScalar128{"201", 30, 3}, // x + DecimalScalar128{"301", 30, 3}, // y + DecimalScalar128{"502", 31, 3}); // expected + + // max precision + AddAndVerify(DecimalScalar128{"09999999999999999999999999999999000000", 38, 5}, // x + DecimalScalar128{"100", 38, 7}, // y + DecimalScalar128{"99999999999999999999999999999990000010", 38, 6}); + + // Both -ve + AddAndVerify(DecimalScalar128{"-201", 30, 3}, // x + DecimalScalar128{"-301", 30, 2}, // y + DecimalScalar128{"-3211", 32, 3}); // expected + + // -ve and max precision + AddAndVerify(DecimalScalar128{"-09999999999999999999999999999999000000", 38, 5}, // x + DecimalScalar128{"-100", 38, 7}, // y + DecimalScalar128{"-99999999999999999999999999999990000010", 38, 6}); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/decimal_wrapper.cc b/cpp/src/gandiva/precompiled/decimal_wrapper.cc new file mode 100644 index 0000000000000..f327a50cce663 --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_wrapper.cc @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/precompiled/decimal_ops.h" +#include "gandiva/precompiled/types.h" + +extern "C" { + +FORCE_INLINE +void add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision, + int32_t x_scale, int64_t y_high, uint64_t y_low, + int32_t y_precision, int32_t y_scale, + int32_t out_precision, int32_t out_scale, + int64_t* out_high, uint64_t* out_low) { + gandiva::BasicDecimalScalar128 x(x_high, x_low, x_precision, x_scale); + gandiva::BasicDecimalScalar128 y(y_high, y_low, y_precision, y_scale); + + arrow::BasicDecimal128 out = gandiva::decimalops::Add(x, y, out_precision, out_scale); + *out_high = out.high_bits(); + *out_low = out.low_bits(); +} + +} // extern "C" diff --git a/cpp/src/gandiva/precompiled/epoch_time_point.h b/cpp/src/gandiva/precompiled/epoch_time_point.h index dc6340d134e0a..115f019525118 100644 --- a/cpp/src/gandiva/precompiled/epoch_time_point.h +++ b/cpp/src/gandiva/precompiled/epoch_time_point.h @@ -19,7 +19,7 @@ #define GANDIVA_EPOCH_TIME_POINT_H // TODO(wesm): IR compilation does not have any include directories set -#include "../../arrow/util/date.h" +#include "../../arrow/vendored/date.h" // A point of time measured in millis since epoch. class EpochTimePoint { diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 8020a45b3d302..8fc5b8c446927 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -45,12 +45,10 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::shared_ptr configuration, std::shared_ptr* projector) { - ARROW_RETURN_FAILURE_IF_FALSE(schema != nullptr, - Status::Invalid("schema cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(!exprs.empty(), - Status::Invalid("expressions need to be non-empty")); - ARROW_RETURN_FAILURE_IF_FALSE(configuration != nullptr, - Status::Invalid("configuration cannot be null")); + ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null")); + ARROW_RETURN_IF(exprs.empty(), Status::Invalid("Expressions cannot be empty")); + ARROW_RETURN_IF(configuration == nullptr, + Status::Invalid("Configuration cannot be null")); // see if equivalent projector was already built static Cache> cache; @@ -63,23 +61,21 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Build LLVM generator, and generate code for the specified expressions std::unique_ptr llvm_gen; - Status status = LLVMGenerator::Make(configuration, &llvm_gen); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen)); // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. 
ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { - status = expr_validator.Validate(expr); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(expr_validator.Validate(expr)); } - status = llvm_gen->Build(exprs); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_gen->Build(exprs)); // save the output field types. Used for validation at Evaluate() time. std::vector output_fields; + output_fields.reserve(exprs.size()); for (auto& expr : exprs) { output_fields.push_back(expr->result()); } @@ -94,133 +90,109 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) { - Status status = ValidateEvaluateArgsCommon(batch); - ARROW_RETURN_NOT_OK(status); - - if (output_data_vecs.size() != output_fields_.size()) { - std::stringstream ss; - ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() - << ", expected " << output_fields_.size(); - return Status::Invalid(ss.str()); - } + ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + ARROW_RETURN_IF( + output_data_vecs.size() != output_fields_.size(), + Status::Invalid("Number of output buffers must match number of fields")); int idx = 0; for (auto& array_data : output_data_vecs) { + const auto output_field = output_fields_[idx]; if (array_data == nullptr) { - std::stringstream ss; - ss << "array for output field " << output_fields_[idx]->name() << "is null."; - return Status::Invalid(ss.str()); + return Status::Invalid("Output array for field ", output_field->name(), + " should not be null"); } - Status status = - ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), batch.num_rows()); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK( + ValidateArrayDataCapacity(*array_data, *output_field, batch.num_rows())); ++idx; } + return llvm_generator_->Execute(batch, output_data_vecs); } Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool, arrow::ArrayVector* output) { - Status status = ValidateEvaluateArgsCommon(batch); - ARROW_RETURN_NOT_OK(status); - - if (output == nullptr) { - return Status::Invalid("output must be non-null."); - } - - if (pool == nullptr) { - return Status::Invalid("memory pool must be non-null."); - } + ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); + ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); // Allocate the output data vecs. ArrayDataVector output_data_vecs; + output_data_vecs.reserve(output_fields_.size()); for (auto& field : output_fields_) { ArrayDataPtr output_data; - status = AllocArrayData(field->type(), batch.num_rows(), pool, &output_data); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK( + AllocArrayData(field->type(), batch.num_rows(), pool, &output_data)); output_data_vecs.push_back(output_data); } // Execute the expression(s). - status = llvm_generator_->Execute(batch, output_data_vecs); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, output_data_vecs)); // Create and return array arrays. 
 
   // Create and return arrow arrays.
   output->clear();
   for (auto& array_data : output_data_vecs) {
     output->push_back(arrow::MakeArray(array_data));
   }
+
   return Status::OK();
 }
 
 // TODO : handle variable-len vectors
 Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records,
                                  arrow::MemoryPool* pool, ArrayDataPtr* array_data) {
-  if (!arrow::is_primitive(type->id())) {
-    return Status::Invalid("Unsupported output data type " + type->ToString());
-  }
+  const auto* fw_type = dynamic_cast<const arrow::FixedWidthType*>(type.get());
+  ARROW_RETURN_IF(fw_type == nullptr,
+                  Status::Invalid("Unsupported output data type ", type));
 
-  arrow::Status astatus;
   std::shared_ptr<arrow::Buffer> null_bitmap;
-  int64_t size = arrow::BitUtil::BytesForBits(num_records);
-  astatus = arrow::AllocateBuffer(pool, size, &null_bitmap);
-  ARROW_RETURN_NOT_OK(astatus);
+  int64_t bitmap_bytes = arrow::BitUtil::BytesForBits(num_records);
+  ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, bitmap_bytes, &null_bitmap));
 
   std::shared_ptr<arrow::Buffer> data;
-  const auto& fw_type = dynamic_cast<const arrow::FixedWidthType&>(*type);
-  int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width());
-  astatus = arrow::AllocateBuffer(pool, data_len, &data);
-  ARROW_RETURN_NOT_OK(astatus);
+  int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type->bit_width());
+  ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, data_len, &data));
+
+  // This is not strictly required but valgrind gets confused and detects this
+  // as uninitialized memory access. See arrow::util::SetBitTo().
+  if (type->id() == arrow::Type::BOOL) {
+    memset(data->mutable_data(), 0, data_len);
+  }
 
   *array_data = arrow::ArrayData::Make(type, num_records, {null_bitmap, data});
   return Status::OK();
 }
 
 Status Projector::ValidateEvaluateArgsCommon(const arrow::RecordBatch& batch) {
-  if (!batch.schema()->Equals(*schema_)) {
-    return Status::Invalid("Schema in RecordBatch must match the schema in Make()");
-  }
-  if (batch.num_rows() == 0) {
-    return Status::Invalid("RecordBatch must be non-empty.");
-  }
+  ARROW_RETURN_IF(!batch.schema()->Equals(*schema_),
+                  Status::Invalid("Schema in RecordBatch must match schema in Make()"));
+  ARROW_RETURN_IF(batch.num_rows() == 0,
+                  Status::Invalid("RecordBatch must be non-empty."));
+
   return Status::OK();
 }
 
 Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data,
                                             const arrow::Field& field,
                                             int64_t num_records) {
-  // verify that there are atleast two buffers (validity and data).
-  if (array_data.buffers.size() < 2) {
-    std::stringstream ss;
-    ss << "number of buffers for output field " << field.name() << "is "
-       << array_data.buffers.size() << ", must have minimum 2.";
-    return Status::Invalid(ss.str());
-  }
+  ARROW_RETURN_IF(array_data.buffers.size() < 2,
+                  Status::Invalid("ArrayData must have at least 2 buffers"));
 
-  // verify size of bitmap buffer.
   int64_t min_bitmap_len = arrow::BitUtil::BytesForBits(num_records);
   int64_t bitmap_len = array_data.buffers[0]->capacity();
-  if (bitmap_len < min_bitmap_len) {
-    std::stringstream ss;
-    ss << "bitmap buffer for output field " << field.name() << "has size " << bitmap_len
-       << ", must have minimum size " << min_bitmap_len;
-    return Status::Invalid(ss.str());
-  }
+  ARROW_RETURN_IF(bitmap_len < min_bitmap_len,
+                  Status::Invalid("Bitmap buffer too small for ", field.name()));
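  // (Worked example of the capacity math, with BytesForBits(n) == (n + 7) / 8:
  // for num_records = 100 and an int32 output field, the validity bitmap needs
  // at least (100 + 7) / 8 = 13 bytes and the data buffer at least
  // (100 * 32 + 7) / 8 = 400 bytes.)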
 
   // verify size of data buffer.
   // TODO : handle variable-len vectors
   const auto& fw_type = dynamic_cast<const arrow::FixedWidthType&>(*field.type());
   int64_t min_data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width());
   int64_t data_len = array_data.buffers[1]->capacity();
-  if (data_len < min_data_len) {
-    std::stringstream ss;
-    ss << "data buffer for output field " << field.name() << " has size " << data_len
-       << ", must have minimum size " << min_data_len;
-    return Status::Invalid(ss.str());
-  }
+  ARROW_RETURN_IF(data_len < min_data_len,
+                  Status::Invalid("Data buffer too small for ", field.name()));
+
   return Status::OK();
 }
diff --git a/cpp/src/gandiva/projector_cache_key.h b/cpp/src/gandiva/projector_cache_key.h
index e5839163b4d18..26da5288e5d15 100644
--- a/cpp/src/gandiva/projector_cache_key.h
+++ b/cpp/src/gandiva/projector_cache_key.h
@@ -41,7 +41,7 @@ class ProjectorCacheKey {
       boost::hash_combine(result, expr_as_string);
       UpdateUniqifier(expr_as_string);
     }
-    boost::hash_combine(result, configuration);
+    boost::hash_combine(result, configuration->Hash());
     boost::hash_combine(result, schema_->ToString());
     boost::hash_combine(result, uniqifier_);
     hash_code_ = result;
@@ -55,7 +55,7 @@ class ProjectorCacheKey {
       return false;
     }
 
-    if (configuration_ != other.configuration_) {
+    if (*configuration_ != *other.configuration_) {
       return false;
     }
diff --git a/cpp/src/gandiva/proto/Types.proto b/cpp/src/gandiva/proto/Types.proto
index ac19d0f1c1919..7474065f68b73 100644
--- a/cpp/src/gandiva/proto/Types.proto
+++ b/cpp/src/gandiva/proto/Types.proto
@@ -146,6 +146,13 @@ message BinaryNode {
   optional bytes value = 1;
 }
 
+message DecimalNode {
+  optional string value = 1;
+  optional int32 precision = 2;
+  optional int32 scale = 3;
+}
+
+
 message TreeNode {
   optional FieldNode fieldNode = 1;
   optional FunctionNode fnNode = 2;
@@ -164,6 +171,7 @@ message TreeNode {
   optional DoubleNode doubleNode = 16;
   optional StringNode stringNode = 17;
   optional BinaryNode binaryNode = 18;
+  optional DecimalNode decimalNode = 19;
 }
 
 message ExpressionRoot {
diff --git a/cpp/src/gandiva/regex_util.cc b/cpp/src/gandiva/regex_util.cc
index 893af095a3dd2..abdd579d1f5e4 100644
--- a/cpp/src/gandiva/regex_util.cc
+++ b/cpp/src/gandiva/regex_util.cc
@@ -20,7 +20,7 @@
 namespace gandiva {
 
 const std::set<char> RegexUtil::pcre_regex_specials_ = {
-    '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\'};
+    '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\', '.'};
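// (Illustrative LIKE-to-PCRE translations under the rules in this file, assuming
// '\\' as the escape char: "%str" becomes ".*str", "_str" becomes ".str", and
// with '.' now tracked as a PCRE special, "a.b%" becomes "a\\.b.*" so the
// literal dot no longer matches arbitrary characters.)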
 
 Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char escape_char,
                                        std::string& pcre_pattern) {
@@ -38,20 +38,16 @@ Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char esca
     if (cur == escape_char) {
       // escape char must be followed by '_', '%' or the escape char itself.
       ++idx;
-      if (idx == sql_pattern.size()) {
-        std::stringstream msg;
-        msg << "unexpected escape char at the end of pattern " << sql_pattern;
-        return Status::Invalid(msg.str());
-      }
+      ARROW_RETURN_IF(
+          idx == sql_pattern.size(),
+          Status::Invalid("Unexpected escape char at the end of pattern ", sql_pattern));
 
       cur = sql_pattern.at(idx);
       if (cur == '_' || cur == '%' || cur == escape_char) {
         pcre_pattern += cur;
       } else {
-        std::stringstream msg;
-        msg << "invalid escape sequence in pattern " << sql_pattern << " at offset "
-            << idx;
-        return Status::Invalid(msg.str());
+        return Status::Invalid("Invalid escape sequence in pattern ", sql_pattern,
+                               " at offset ", idx);
       }
     } else if (cur == '_') {
       pcre_pattern += '.';
diff --git a/cpp/src/gandiva/selection_vector.cc b/cpp/src/gandiva/selection_vector.cc
index 9266ca7fe1056..f89b80c2b510f 100644
--- a/cpp/src/gandiva/selection_vector.cc
+++ b/cpp/src/gandiva/selection_vector.cc
@@ -28,22 +28,15 @@ namespace gandiva {
 
 Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap_size,
                                            int64_t max_bitmap_index) {
-  if (bitmap_size % 8 != 0) {
-    std::stringstream ss;
-    ss << "bitmap size " << bitmap_size << " must be padded to 64-bit size";
-    return Status::Invalid(ss.str());
-  }
-  if (max_bitmap_index < 0) {
-    std::stringstream ss;
-    ss << "max bitmap index " << max_bitmap_index << " must be positive";
-    return Status::Invalid(ss.str());
-  }
-  if (static_cast<uint64_t>(max_bitmap_index) > GetMaxSupportedValue()) {
-    std::stringstream ss;
-    ss << "max_bitmap_index " << max_bitmap_index << " must be <= maxSupportedValue "
-       << GetMaxSupportedValue() << " in selection vector";
-    return Status::Invalid(ss.str());
-  }
+  const uint64_t max_idx = static_cast<uint64_t>(max_bitmap_index);
+  ARROW_RETURN_IF(bitmap_size % 8, Status::Invalid("Bitmap size ", bitmap_size,
+                                                   " must be aligned to 64-bit size"));
+  ARROW_RETURN_IF(max_bitmap_index < 0,
+                  Status::Invalid("Max bitmap index must be positive"));
+  ARROW_RETURN_IF(
+      max_idx > GetMaxSupportedValue(),
+      Status::Invalid("max_bitmap_index ", max_idx, " must be <= maxSupportedValue ",
+                      GetMaxSupportedValue(), " in selection vector"));
 
   int64_t max_slots = GetMaxSlots();
 
@@ -64,9 +57,9 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap
       break;
     }
 
-    if (selection_idx >= max_slots) {
-      return Status::Invalid("selection vector has no remaining slots");
-    }
+    ARROW_RETURN_IF(selection_idx >= max_slots,
+                    Status::Invalid("selection vector has no remaining slots"));
+
     SetIndex(selection_idx, pos_in_bitmap);
     ++selection_idx;
 
@@ -81,60 +74,54 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap
 
 Status SelectionVector::MakeInt16(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer,
                                   std::shared_ptr<SelectionVector>* selection_vector) {
-  auto status = SelectionVectorInt16::ValidateBuffer(max_slots, buffer);
-  ARROW_RETURN_NOT_OK(status);
-
+  ARROW_RETURN_NOT_OK(SelectionVectorInt16::ValidateBuffer(max_slots, buffer));
   *selection_vector = std::make_shared<SelectionVectorInt16>(max_slots, buffer);
+
   return Status::OK();
 }
 
 Status SelectionVector::MakeInt16(int64_t max_slots, arrow::MemoryPool* pool,
                                   std::shared_ptr<SelectionVector>* selection_vector) {
   std::shared_ptr<arrow::Buffer> buffer;
-  auto status = SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer);
-  ARROW_RETURN_NOT_OK(status);
-
+  ARROW_RETURN_NOT_OK(SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer));
   *selection_vector = std::make_shared<SelectionVectorInt16>(max_slots, buffer);
+
  return Status::OK();
 }
 
 Status SelectionVector::MakeInt32(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer,
                                   std::shared_ptr<SelectionVector>* selection_vector) {
-  auto status = SelectionVectorInt32::ValidateBuffer(max_slots, buffer);
-  ARROW_RETURN_NOT_OK(status);
-
+  ARROW_RETURN_NOT_OK(SelectionVectorInt32::ValidateBuffer(max_slots, buffer));
   *selection_vector = std::make_shared<SelectionVectorInt32>(max_slots, buffer);
+
   return Status::OK();
 }
 
 Status SelectionVector::MakeInt32(int64_t max_slots, arrow::MemoryPool* pool,
                                   std::shared_ptr<SelectionVector>* selection_vector) {
   std::shared_ptr<arrow::Buffer> buffer;
-  auto status = SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer);
-  ARROW_RETURN_NOT_OK(status);
-
+  ARROW_RETURN_NOT_OK(SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer));
   *selection_vector = std::make_shared<SelectionVectorInt32>(max_slots, buffer);
+
   return Status::OK();
 }
 
 Status SelectionVector::MakeInt64(int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer,
                                   std::shared_ptr<SelectionVector>* selection_vector) {
-  auto status = SelectionVectorInt64::ValidateBuffer(max_slots, buffer);
-  ARROW_RETURN_NOT_OK(status);
-
+  ARROW_RETURN_NOT_OK(SelectionVectorInt64::ValidateBuffer(max_slots, buffer));
   *selection_vector = std::make_shared<SelectionVectorInt64>(max_slots, buffer);
+
   return Status::OK();
 }
 
 Status SelectionVector::MakeInt64(int64_t max_slots, arrow::MemoryPool* pool,
                                   std::shared_ptr<SelectionVector>* selection_vector) {
   std::shared_ptr<arrow::Buffer> buffer;
-  auto status = SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer);
-  ARROW_RETURN_NOT_OK(status);
-
+  ARROW_RETURN_NOT_OK(SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer));
   *selection_vector = std::make_shared<SelectionVectorInt64>(max_slots, buffer);
+
   return Status::OK();
 }
 
@@ -142,8 +129,7 @@ template <typename C_TYPE>
 Status SelectionVectorImpl<C_TYPE>::AllocateBuffer(
     int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr<arrow::Buffer>* buffer) {
   auto buffer_len = max_slots * sizeof(C_TYPE);
-  auto astatus = arrow::AllocateBuffer(pool, buffer_len, buffer);
-  ARROW_RETURN_NOT_OK(astatus);
+  ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, buffer_len, buffer));
 
   return Status::OK();
 }
 
@@ -151,19 +137,13 @@ template <typename C_TYPE>
 Status SelectionVectorImpl<C_TYPE>::ValidateBuffer(
     int64_t max_slots, std::shared_ptr<arrow::Buffer> buffer) {
-  // verify buffer is mutable
-  if (!buffer->is_mutable()) {
-    return Status::Invalid("buffer for selection vector must be mutable");
-  }
+  ARROW_RETURN_IF(!buffer->is_mutable(),
+                  Status::Invalid("buffer for selection vector must be mutable"));
+
+  const int64_t min_len = max_slots * sizeof(C_TYPE);
+  ARROW_RETURN_IF(buffer->size() < min_len,
+                  Status::Invalid("Buffer for selection vector is too small"));
 
-  // verify size of buffer.
-  int64_t min_len = max_slots * sizeof(C_TYPE);
-  if (buffer->size() < min_len) {
-    std::stringstream ss;
-    ss << "buffer for selection_data has size " << buffer->size()
-       << ", must have minimum size " << min_len;
-    return Status::Invalid(ss.str());
-  }
   return Status::OK();
 }
diff --git a/cpp/src/gandiva/selection_vector_test.cc b/cpp/src/gandiva/selection_vector_test.cc
index acb0f338cd6ae..67389273c82f2 100644
--- a/cpp/src/gandiva/selection_vector_test.cc
+++ b/cpp/src/gandiva/selection_vector_test.cc
@@ -18,6 +18,7 @@
 #include "gandiva/selection_vector.h"
 
 #include <memory>
+#include <vector>
 
 #include <gtest/gtest.h>
@@ -102,15 +103,14 @@ TEST_F(TestSelectionVector, TestInt16PopulateFromBitMap) {
   EXPECT_EQ(status.ok(), true) << status.message();
 
   int bitmap_size = RoundUpNumi64(max_slots) * 8;
-  std::unique_ptr<uint8_t[]> bitmap(new uint8_t[bitmap_size]);
-  memset(bitmap.get(), 0, bitmap_size);
+  std::vector<uint8_t> bitmap(bitmap_size);
 
-  arrow::BitUtil::SetBit(bitmap.get(), 0);
-  arrow::BitUtil::SetBit(bitmap.get(), 5);
-  arrow::BitUtil::SetBit(bitmap.get(), 121);
-  arrow::BitUtil::SetBit(bitmap.get(), 220);
+  arrow::BitUtil::SetBit(&bitmap[0], 0);
+  arrow::BitUtil::SetBit(&bitmap[0], 5);
+  arrow::BitUtil::SetBit(&bitmap[0], 121);
+  arrow::BitUtil::SetBit(&bitmap[0], 220);
 
-  status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1);
+  status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1);
   EXPECT_EQ(status.ok(), true) << status.message();
 
   EXPECT_EQ(selection->GetNumSlots(), 3);
@@ -127,15 +127,14 @@ TEST_F(TestSelectionVector, TestInt16PopulateFromBitMapNegative) {
   EXPECT_EQ(status.ok(), true) << status.message();
 
   int bitmap_size = 16;
-  std::unique_ptr<uint8_t[]> bitmap(new uint8_t[bitmap_size]);
-  memset(bitmap.get(), 0, bitmap_size);
+  std::vector<uint8_t> bitmap(bitmap_size);
 
-  arrow::BitUtil::SetBit(bitmap.get(), 0);
-  arrow::BitUtil::SetBit(bitmap.get(), 1);
-  arrow::BitUtil::SetBit(bitmap.get(), 2);
+  arrow::BitUtil::SetBit(&bitmap[0], 0);
+  arrow::BitUtil::SetBit(&bitmap[0], 1);
+  arrow::BitUtil::SetBit(&bitmap[0], 2);
 
   // The bitmap has three set bits, whereas the selection vector has capacity for only 2.
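  // (Contract being exercised, in brief: PopulateFromBitMap appends one index per
  // set bit up to max_bitmap_index; once the vector's slots are exhausted, the
  // "no remaining slots" check in PopulateFromBitMap fires, so the call below
  // must return Status::Invalid rather than silently dropping the third index.)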
- status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, 2); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, 2); EXPECT_EQ(status.IsInvalid(), true); } @@ -175,15 +174,14 @@ TEST_F(TestSelectionVector, TestInt32PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); @@ -243,15 +241,14 @@ TEST_F(TestSelectionVector, TestInt64PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 1fd30aac495cf..c81618e8ebf32 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -27,6 +27,8 @@ ADD_GANDIVA_TEST(to_string_test) ADD_GANDIVA_TEST(hash_test) ADD_GANDIVA_TEST(in_expr_test) ADD_GANDIVA_TEST(null_validity_test) +ADD_GANDIVA_TEST(decimal_test) +ADD_GANDIVA_TEST(decimal_single_test) ADD_GANDIVA_TEST(projector_test_static SOURCES projector_test.cc diff --git a/cpp/src/gandiva/tests/binary_test.cc b/cpp/src/gandiva/tests/binary_test.cc index d5d99db910b9d..6ac3c5155196e 100644 --- a/cpp/src/gandiva/tests/binary_test.cc +++ b/cpp/src/gandiva/tests/binary_test.cc @@ -61,7 +61,7 @@ TEST_F(TestBinary, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/boolean_expr_test.cc b/cpp/src/gandiva/tests/boolean_expr_test.cc index 3351ab3ccf3ff..9226f357159c6 100644 --- a/cpp/src/gandiva/tests/boolean_expr_test.cc +++ b/cpp/src/gandiva/tests/boolean_expr_test.cc @@ -60,7 +60,7 @@ TEST_F(TestBooleanExpr, SimpleAnd) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // FALSE_VALID && ? => FALSE_VALID @@ -133,7 +133,7 @@ TEST_F(TestBooleanExpr, SimpleOr) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // TRUE_VALID && ? => TRUE_VALID @@ -210,7 +210,7 @@ TEST_F(TestBooleanExpr, AndThree) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 8; @@ -257,7 +257,7 @@ TEST_F(TestBooleanExpr, OrThree) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 8; @@ -317,7 +317,7 @@ TEST_F(TestBooleanExpr, BooleanAndInsideIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 4; @@ -368,7 +368,7 @@ TEST_F(TestBooleanExpr, IfInsideBooleanAnd) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 4; diff --git a/cpp/src/gandiva/tests/date_time_test.cc b/cpp/src/gandiva/tests/date_time_test.cc index 3914558d716c7..643b8c8dda3ce 100644 --- a/cpp/src/gandiva/tests/date_time_test.cc +++ b/cpp/src/gandiva/tests/date_time_test.cc @@ -73,7 +73,8 @@ TEST_F(TestProjector, TestIsNull) { auto isnotnull_expr = TreeExprBuilder::MakeExpression("isnotnull", {t0}, b0); std::shared_ptr projector; - Status status = Projector::Make(schema, {isnull_expr, isnotnull_expr}, &projector); + auto status = Projector::Make(schema, {isnull_expr, isnotnull_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); int num_records = 4; @@ -126,8 +127,9 @@ TEST_F(TestProjector, TestDateTime) { auto ts2day_expr = TreeExprBuilder::MakeExpression("extractDay", {field2}, field_day); std::shared_ptr projector; - Status status = Projector::Make( - schema, {date2year_expr, date2month_expr, ts2month_expr, ts2day_expr}, &projector); + auto status = Projector::Make( + schema, {date2year_expr, date2month_expr, ts2month_expr, ts2day_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); struct tm y1970; @@ -196,7 +198,8 @@ TEST_F(TestProjector, TestTime) { TreeExprBuilder::MakeExpression("extractHour", {field0}, field_hour); std::shared_ptr projector; - Status status = Projector::Make(schema, {time2min_expr, time2hour_expr}, &projector); + auto status = Projector::Make(schema, {time2min_expr, time2hour_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); // create input data @@ -264,7 +267,7 @@ TEST_F(TestProjector, TestTimestampDiff) { std::shared_ptr projector; auto exprs = {diff_secs_expr, diff_mins_expr, diff_hours_expr, diff_days_expr, diff_weeks_expr, diff_months_expr, 
                          diff_quarters_expr, diff_years_expr};
-  Status status = Projector::Make(schema, exprs, &projector);
+  auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector);
   ASSERT_TRUE(status.ok());
 
   struct tm y1970;
@@ -337,7 +340,8 @@ TEST_F(TestProjector, TestMonthsBetween) {
       TreeExprBuilder::MakeExpression("months_between", {f0, f1}, output);
 
   std::shared_ptr<Projector> projector;
-  Status status = Projector::Make(schema, {months_between_expr}, &projector);
+  auto status =
+      Projector::Make(schema, {months_between_expr}, TestConfiguration(), &projector);
   std::cout << status.message();
   ASSERT_TRUE(status.ok());
diff --git a/cpp/src/gandiva/tests/decimal_single_test.cc b/cpp/src/gandiva/tests/decimal_single_test.cc
new file mode 100644
index 0000000000000..776ef6efbd0d9
--- /dev/null
+++ b/cpp/src/gandiva/tests/decimal_single_test.cc
@@ -0,0 +1,224 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+
+#include "gandiva/decimal_scalar.h"
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::Decimal128;
+
+namespace gandiva {
+
+#define EXPECT_DECIMAL_SUM_EQUALS(x, y, expected, actual) \
+  EXPECT_EQ(expected, actual) << (x).ToString() << " + " << (y).ToString() \
+                              << " expected : " << (expected).ToString()   \
+                              << " actual : " << (actual).ToString();
+
+DecimalScalar128 decimal_literal(const char* value, int precision, int scale) {
+  std::string value_string = std::string(value);
+  return DecimalScalar128(value_string, precision, scale);
+}
+
+class TestDecimalOps : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+  ArrayPtr MakeDecimalVector(const DecimalScalar128& in);
+  void AddAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                    const DecimalScalar128& expected);
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+ArrayPtr TestDecimalOps::MakeDecimalVector(const DecimalScalar128& in) {
+  std::vector<Decimal128> ret;
+
+  Decimal128 decimal_value = in.value();
+
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(in.precision(), in.scale());
+  return MakeArrowArrayDecimal(decimal_type, {decimal_value}, {true});
+}
+
+void TestDecimalOps::AddAndVerify(const DecimalScalar128& x, const DecimalScalar128& y,
+                                  const DecimalScalar128& expected) {
+  auto x_type = std::make_shared<arrow::Decimal128Type>(x.precision(), x.scale());
+  auto y_type = std::make_shared<arrow::Decimal128Type>(y.precision(), y.scale());
+  auto field_x = field("x", x_type);
+  auto field_y = field("y", y_type);
+  auto schema = arrow::schema({field_x, field_y});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, {x_type, y_type},
+                                               &output_type);
+
EXPECT_OK(status); + + // output fields + auto res = field("res", output_type); + + // build expression : x + y + auto expr = TreeExprBuilder::MakeExpression("add", {field_x, field_y}, res); + + // Build a projector for the expression. + std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_OK(status); + + // Create a row-batch with some sample data + auto array_a = MakeDecimalVector(x); + auto array_b = MakeDecimalVector(y); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, 1 /*num_records*/, {array_a, array_b}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_OK(status); + + // Validate results + auto out_array = dynamic_cast(outputs[0].get()); + const Decimal128 out_value(out_array->GetValue(0)); + + auto dtype = dynamic_cast(out_array->type().get()); + std::string value_string = out_value.ToString(0); + DecimalScalar128 actual{value_string, dtype->precision(), dtype->scale()}; + + EXPECT_DECIMAL_SUM_EQUALS(x, y, expected, actual); +} + +TEST_F(TestDecimalOps, TestAdd) { + // fast-path + AddAndVerify(decimal_literal("201", 30, 3), // x + decimal_literal("301", 30, 3), // y + decimal_literal("502", 31, 3)); // expected + + AddAndVerify(decimal_literal("201", 30, 3), // x + decimal_literal("301", 30, 2), // y + decimal_literal("3211", 32, 3)); // expected + + AddAndVerify(decimal_literal("201", 30, 3), // x + decimal_literal("301", 30, 4), // y + decimal_literal("2311", 32, 4)); // expected + + // max precision, but no overflow + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 3), // y + decimal_literal("502", 38, 3)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 2), // y + decimal_literal("3211", 38, 3)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 4), // y + decimal_literal("2311", 38, 4)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 7), // y + decimal_literal("201030", 38, 6)); // expected + + AddAndVerify(decimal_literal("1201", 38, 3), // x + decimal_literal("1801", 38, 3), // y + decimal_literal("3002", 38, 3)); // carry-over from fractional + + // max precision + AddAndVerify(decimal_literal("09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("100", 38, 7), // y + decimal_literal("99999999999999999999999999999990000010", 38, 6)); + + AddAndVerify(decimal_literal("-09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("100", 38, 7), // y + decimal_literal("-99999999999999999999999999999989999990", 38, 6)); + + AddAndVerify(decimal_literal("09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("-100", 38, 7), // y + decimal_literal("99999999999999999999999999999989999990", 38, 6)); + + AddAndVerify(decimal_literal("-09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("-100", 38, 7), // y + decimal_literal("-99999999999999999999999999999990000010", 38, 6)); + + AddAndVerify(decimal_literal("09999999999999999999999999999999999999", 38, 6), // x + decimal_literal("89999999999999999999999999999999999999", 38, 7), // y + decimal_literal("18999999999999999999999999999999999999", 38, 6)); + + // Both -ve + AddAndVerify(decimal_literal("-201", 30, 3), // x + decimal_literal("-301", 30, 2), // y + decimal_literal("-3211", 32, 3)); // expected + + 
AddAndVerify(decimal_literal("-201", 38, 3), // x + decimal_literal("-301", 38, 4), // y + decimal_literal("-2311", 38, 4)); // expected + + // Mix of +ve and -ve + AddAndVerify(decimal_literal("-201", 30, 3), // x + decimal_literal("301", 30, 2), // y + decimal_literal("2809", 32, 3)); // expected + + AddAndVerify(decimal_literal("-201", 38, 3), // x + decimal_literal("301", 38, 4), // y + decimal_literal("-1709", 38, 4)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("-301", 38, 7), // y + decimal_literal("200970", 38, 6)); // expected + + AddAndVerify(decimal_literal("-1901", 38, 4), // x + decimal_literal("1801", 38, 4), // y + decimal_literal("-100", 38, 4)); // expected + + AddAndVerify(decimal_literal("1801", 38, 4), // x + decimal_literal("-1901", 38, 4), // y + decimal_literal("-100", 38, 4)); // expected + + // rounding +ve + AddAndVerify(decimal_literal("1000999", 38, 6), // x + decimal_literal("10000999", 38, 7), // y + decimal_literal("2001099", 38, 6)); + + AddAndVerify(decimal_literal("1000999", 38, 6), // x + decimal_literal("10000995", 38, 7), // y + decimal_literal("2001099", 38, 6)); + + AddAndVerify(decimal_literal("1000999", 38, 6), // x + decimal_literal("10000992", 38, 7), // y + decimal_literal("2001098", 38, 6)); + + // rounding -ve + AddAndVerify(decimal_literal("-1000999", 38, 6), // x + decimal_literal("-10000999", 38, 7), // y + decimal_literal("-2001099", 38, 6)); + + AddAndVerify(decimal_literal("-1000999", 38, 6), // x + decimal_literal("-10000995", 38, 7), // y + decimal_literal("-2001099", 38, 6)); + + AddAndVerify(decimal_literal("-1000999", 38, 6), // x + decimal_literal("-10000992", 38, 7), // y + decimal_literal("-2001098", 38, 6)); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/decimal_test.cc b/cpp/src/gandiva/tests/decimal_test.cc new file mode 100644 index 0000000000000..da93b0e2d9da6 --- /dev/null +++ b/cpp/src/gandiva/tests/decimal_test.cc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
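// (Note on the expected types in these decimal tests: the precision and scale
// of an add appear to follow the usual SQL-style rule, sketched here from the
// expectations in decimal_single_test.cc rather than quoted from
// DecimalTypeUtil itself:
//   scale     = max(s1, s2)
//   precision = max(p1 - s1, p2 - s2) + scale + 1, capped at 38,
// and when the cap is hit the scale is reduced, to no less than 6, to preserve
// integer digits, e.g. (38, 5) + (38, 7) -> precision 41 -> (38, 6).)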
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::Decimal128;
+
+namespace gandiva {
+
+class TestDecimal : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+  std::vector<Decimal128> MakeDecimalVector(std::vector<std::string> values,
+                                            int32_t scale);
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+std::vector<Decimal128> TestDecimal::MakeDecimalVector(std::vector<std::string> values,
+                                                       int32_t scale) {
+  std::vector<Decimal128> ret;
+  for (auto str : values) {
+    Decimal128 str_value;
+    int32_t str_precision;
+    int32_t str_scale;
+
+    auto status = Decimal128::FromString(str, &str_value, &str_precision, &str_scale);
+    DCHECK_OK(status);
+
+    Decimal128 scaled_value;
+    status = str_value.Rescale(str_scale, scale, &scaled_value);
+    ret.push_back(scaled_value);
+  }
+  return ret;
+}
+
+TEST_F(TestDecimal, TestSimple) {
+  // schema for input fields
+  constexpr int32_t precision = 36;
+  constexpr int32_t scale = 18;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = field("a", decimal_type);
+  auto field_b = field("b", decimal_type);
+  auto field_c = field("c", decimal_type);
+  auto schema = arrow::schema({field_a, field_b, field_c});
+
+  Decimal128TypePtr add2_type;
+  auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+                                               {decimal_type, decimal_type}, &add2_type);
+
+  Decimal128TypePtr output_type;
+  status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
+                                          {add2_type, decimal_type}, &output_type);
+
+  // output fields
+  auto res = field("res0", output_type);
+
+  // build expression : a + b + c
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto node_c = TreeExprBuilder::MakeField(field_c);
+  auto add2 = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, add2_type);
+  auto add3 = TreeExprBuilder::MakeFunction("add", {add2, node_c}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(add3, res);
+
+  // Build a projector for the expression.
+ std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + DCHECK_OK(status); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale), + {false, true, true, true}); + auto array_b = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale), + {false, true, true, true}); + auto array_c = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"3", "4", "5", "6"}, scale), + {true, true, true, true}); + + // prepare input record batch + auto in_batch = + arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); + + auto expected = + MakeArrowArrayDecimal(output_type, MakeDecimalVector({"6", "9", "12", "15"}, scale), + {false, true, true, true}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + DCHECK_OK(status); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(expected, outputs[0]); +} + +TEST_F(TestDecimal, TestLiteral) { + // schema for input fields + constexpr int32_t precision = 36; + constexpr int32_t scale = 18; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = field("a", decimal_type); + auto schema = arrow::schema({ + field_a, + }); + + Decimal128TypePtr add2_type; + auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {decimal_type, decimal_type}, &add2_type); + + // output fields + auto res = field("res0", add2_type); + + // build expression : a + b + c + auto node_a = TreeExprBuilder::MakeField(field_a); + static std::string decimal_point_six = "6"; + DecimalScalar128 literal(decimal_point_six, 2, 1); + auto node_b = TreeExprBuilder::MakeDecimalLiteral(literal); + auto add2 = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, add2_type); + auto expr = TreeExprBuilder::MakeExpression(add2, res); + + // Build a projector for the expression. + std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + DCHECK_OK(status); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale), + {false, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + auto expected = MakeArrowArrayDecimal( + add2_type, MakeDecimalVector({"1.6", "2.6", "3.6", "4.6"}, scale), + {false, true, true, true}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + DCHECK_OK(status); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(expected, outputs[0]); +} + +TEST_F(TestDecimal, TestIfElse) { + // schema for input fields + constexpr int32_t precision = 36; + constexpr int32_t scale = 18; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = field("a", decimal_type); + auto field_b = field("b", decimal_type); + auto field_c = field("c", arrow::boolean()); + auto schema = arrow::schema({field_a, field_b, field_c}); + + // output fields + auto field_result = field("res", decimal_type); + + // build expression. 
+ // if (c) + // a + // else + // b + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeField(field_b); + auto node_c = TreeExprBuilder::MakeField(field_c); + auto if_node = TreeExprBuilder::MakeIf(node_c, node_a, node_b, decimal_type); + + auto expr = TreeExprBuilder::MakeExpression(if_node, field_result); + + // Build a projector for the expressions. + std::shared_ptr projector; + Status status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + DCHECK_OK(status); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale), + {false, true, true, true}); + auto array_b = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale), + {true, true, true, true}); + + auto array_c = MakeArrowArrayBool({true, false, true, false}, {true, true, true, true}); + + // expected output + auto exp = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"0", "3", "3", "5"}, scale), + {false, true, true, true}); + + // prepare input record batch + auto in_batch = + arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + DCHECK_OK(status); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/filter_test.cc b/cpp/src/gandiva/tests/filter_test.cc index f95cdcc3fef9c..ee60388d5dc1f 100644 --- a/cpp/src/gandiva/tests/filter_test.cc +++ b/cpp/src/gandiva/tests/filter_test.cc @@ -50,14 +50,15 @@ TEST_F(TestFilter, TestFilterCache) { auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10}, arrow::boolean()); auto condition = TreeExprBuilder::MakeCondition(less_than_10); + auto configuration = TestConfiguration(); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, configuration, &filter); EXPECT_TRUE(status.ok()); // same schema and condition, should return the same filter as above. 
std::shared_ptr cached_filter; - status = Filter::Make(schema, condition, &cached_filter); + status = Filter::Make(schema, condition, configuration, &cached_filter); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() == filter.get()); @@ -65,7 +66,8 @@ TEST_F(TestFilter, TestFilterCache) { auto field2 = field("f2", int32()); auto different_schema = arrow::schema({field0, field1, field2}); std::shared_ptr should_be_new_filter; - status = Filter::Make(different_schema, condition, &should_be_new_filter); + status = + Filter::Make(different_schema, condition, configuration, &should_be_new_filter); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() != should_be_new_filter.get()); @@ -74,7 +76,7 @@ TEST_F(TestFilter, TestFilterCache) { "greater_than", {sum_func, literal_10}, arrow::boolean()); auto new_condition = TreeExprBuilder::MakeCondition(greater_than_10); std::shared_ptr should_be_new_filter1; - status = Filter::Make(schema, new_condition, &should_be_new_filter1); + status = Filter::Make(schema, new_condition, configuration, &should_be_new_filter1); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() != should_be_new_filter1.get()); } @@ -96,7 +98,7 @@ TEST_F(TestFilter, TestSimple) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -134,7 +136,7 @@ TEST_F(TestFilter, TestSimpleCustomConfig) { std::shared_ptr config = config_builder.build(); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -168,7 +170,7 @@ TEST_F(TestFilter, TestZeroCopy) { auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0}); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -208,7 +210,7 @@ TEST_F(TestFilter, TestZeroCopyNegative) { auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0}); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -265,7 +267,7 @@ TEST_F(TestFilter, TestSimpleSVInt32) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/generate_data.h b/cpp/src/gandiva/tests/generate_data.h index 01665b8ee17c5..398057510cb08 100644 --- a/cpp/src/gandiva/tests/generate_data.h +++ b/cpp/src/gandiva/tests/generate_data.h @@ -19,6 +19,8 @@ #include #include +#include "arrow/util/decimal.h" + #ifndef GANDIVA_GENERATE_DATA_H #define GANDIVA_GENERATE_DATA_H @@ -79,6 +81,24 @@ class Int64DataGenerator : public DataGenerator { Random random_; }; +class Decimal128DataGenerator : public DataGenerator { + public: + explicit Decimal128DataGenerator(bool large) : large_(large) {} + + arrow::Decimal128 
GenerateData() { + uint64_t low = random_.next(); + int64_t high = random_.next(); + if (large_) { + high += (1ull << 62); + } + return arrow::Decimal128(high, low); + } + + protected: + bool large_; + Random random_; +}; + class FastUtf8DataGenerator : public DataGenerator { public: explicit FastUtf8DataGenerator(int max_len) : max_len_(max_len), cur_char_('a') {} diff --git a/cpp/src/gandiva/tests/hash_test.cc b/cpp/src/gandiva/tests/hash_test.cc index 96f92284a5ca1..afaa885dfe26b 100644 --- a/cpp/src/gandiva/tests/hash_test.cc +++ b/cpp/src/gandiva/tests/hash_test.cc @@ -61,7 +61,8 @@ TEST_F(TestHash, TestSimple) { // Build a projector for the expression. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_0, expr_1}, &projector); + auto status = + Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -113,7 +114,8 @@ TEST_F(TestHash, TestBuf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_0, expr_1}, &projector); + auto status = + Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/huge_table_test.cc b/cpp/src/gandiva/tests/huge_table_test.cc index bffcb1994707f..cecf290a1439f 100644 --- a/cpp/src/gandiva/tests/huge_table_test.cc +++ b/cpp/src/gandiva/tests/huge_table_test.cc @@ -58,7 +58,7 @@ TEST_F(DISABLED_TestHugeProjector, SimpleTestSumHuge) { // Build expression auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr}, &projector); + auto status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -136,7 +136,7 @@ TEST_F(DISABLED_TestHugeFilter, TestSimpleHugeFilter) { auto condition = TreeExprBuilder::MakeCondition(less_than_50); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // prepare input record batch diff --git a/cpp/src/gandiva/tests/if_expr_test.cc b/cpp/src/gandiva/tests/if_expr_test.cc index 93b35673b9467..54b6d43b4df1c 100644 --- a/cpp/src/gandiva/tests/if_expr_test.cc +++ b/cpp/src/gandiva/tests/if_expr_test.cc @@ -61,7 +61,7 @@ TEST_F(TestIfExpr, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -110,7 +110,7 @@ TEST_F(TestIfExpr, TestSimpleArithmetic) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -165,7 +165,7 @@ TEST_F(TestIfExpr, TestNested) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -228,7 +228,7 @@ TEST_F(TestIfExpr, TestNestedInIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -296,7 +296,7 @@ TEST_F(TestIfExpr, TestNestedInCondition) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -353,7 +353,7 @@ TEST_F(TestIfExpr, TestBigNested) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/in_expr_test.cc b/cpp/src/gandiva/tests/in_expr_test.cc index 13ef97cfb8814..2103874cb1e2c 100644 --- a/cpp/src/gandiva/tests/in_expr_test.cc +++ b/cpp/src/gandiva/tests/in_expr_test.cc @@ -51,7 +51,7 @@ TEST_F(TestIn, TestInSimple) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -88,7 +88,7 @@ TEST_F(TestIn, TestInString) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -125,7 +125,7 @@ TEST_F(TestIn, TestInStringValidationError) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Evaluation expression for IN clause returns "; diff --git a/cpp/src/gandiva/tests/literal_test.cc b/cpp/src/gandiva/tests/literal_test.cc index ced66452a2d45..53323cb4e7cbb 100644 --- a/cpp/src/gandiva/tests/literal_test.cc +++ b/cpp/src/gandiva/tests/literal_test.cc @@ -88,8 +88,8 @@ TEST_F(TestLiteral, TestSimpleArithmetic) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = - Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e}, &projector); + auto status = Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e}, + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -133,7 +133,7 @@ TEST_F(TestLiteral, TestLiteralHash) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); auto res1 = field("a", int64()); @@ -142,7 +142,7 @@ TEST_F(TestLiteral, TestLiteralHash) { // Build a projector for the expressions. std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector.get() != projector1.get()); } @@ -165,7 +165,7 @@ TEST_F(TestLiteral, TestNullLiteral) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -207,7 +207,7 @@ TEST_F(TestLiteral, TestNullLiteralInIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index 7d844eb378bf8..e0794a233a2ce 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -19,6 +19,7 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" #include "benchmark/benchmark.h" +#include "gandiva/decimal_type_util.h" #include "gandiva/projector.h" #include "gandiva/tests/test_util.h" #include "gandiva/tests/timed_evaluate.h" @@ -31,10 +32,6 @@ using arrow::int32; using arrow::int64; using arrow::utf8; -// TODO : the base numbers are from a mac. they need to be caliberated -// for the hardware used by travis. -float tolerance_ratio = 6.0; - static void TimedTestAdd3(benchmark::State& state) { // schema for input fields auto field0 = field("f0", int64()); @@ -56,7 +53,7 @@ static void TimedTestAdd3(benchmark::State& state) { auto sum_expr = TreeExprBuilder::MakeExpression(sum, field_sum); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {sum_expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector)); Int64DataGenerator data_generator; ProjectEvaluator evaluator(projector); @@ -99,7 +96,7 @@ static void TimedTestBigNested(benchmark::State& state) { // Build a projector for the expressions. 
std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); BoundedInt32DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -122,7 +119,7 @@ static void TimedTestExtractYear(benchmark::State& state) { auto expr = TreeExprBuilder::MakeExpression("extractYear", {field0}, field_res); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); Int64DataGenerator data_generator; ProjectEvaluator evaluator(projector); @@ -149,7 +146,7 @@ static void TimedTestFilterAdd2(benchmark::State& state) { auto condition = TreeExprBuilder::MakeCondition(less_than); std::shared_ptr filter; - ASSERT_OK(Filter::Make(schema, condition, &filter)); + ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter)); Int64DataGenerator data_generator; FilterEvaluator evaluator(filter); @@ -173,7 +170,7 @@ static void TimedTestFilterLike(benchmark::State& state) { auto condition = TreeExprBuilder::MakeCondition(like_yellow); std::shared_ptr filter; - ASSERT_OK(Filter::Make(schema, condition, &filter)); + ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter)); FastUtf8DataGenerator data_generator(32); FilterEvaluator evaluator(filter); @@ -199,7 +196,7 @@ static void TimedTestAllocs(benchmark::State& state) { auto expr = TreeExprBuilder::MakeExpression(length, field_res); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(64); ProjectEvaluator evaluator(projector); @@ -237,7 +234,7 @@ static void TimedTestMultiOr(benchmark::State& state) { // Build a projector for the expressions. std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -269,7 +266,7 @@ static void TimedTestInExpr(benchmark::State& state) { // Build a projector for the expressions. 
std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -280,6 +277,119 @@ static void TimedTestInExpr(benchmark::State& state) { ASSERT_OK(status); } +static void DoDecimalAdd3(benchmark::State& state, int32_t precision, int32_t scale, + bool large = false) { + // schema for input fields + auto decimal_type = std::make_shared(precision, scale); + auto field0 = field("f0", decimal_type); + auto field1 = field("f1", decimal_type); + auto field2 = field("f2", decimal_type); + auto schema = arrow::schema({field0, field1, field2}); + + Decimal128TypePtr add2_type; + auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {decimal_type, decimal_type}, &add2_type); + + Decimal128TypePtr output_type; + status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {add2_type, decimal_type}, &output_type); + + // output field + auto field_sum = field("add", output_type); + + // Build expression + auto part_sum = TreeExprBuilder::MakeFunction( + "add", {TreeExprBuilder::MakeField(field1), TreeExprBuilder::MakeField(field2)}, + add2_type); + auto sum = TreeExprBuilder::MakeFunction( + "add", {TreeExprBuilder::MakeField(field0), part_sum}, output_type); + + auto sum_expr = TreeExprBuilder::MakeExpression(sum, field_sum); + + std::shared_ptr projector; + status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()); + + Decimal128DataGenerator data_generator(large); + ProjectEvaluator evaluator(projector); + + status = TimedEvaluate( + schema, evaluator, data_generator, arrow::default_memory_pool(), 1 * MILLION, + 16 * THOUSAND, state); + ASSERT_OK(status); +} + +static void DoDecimalAdd2(benchmark::State& state, int32_t precision, int32_t scale, + bool large = false) { + // schema for input fields + auto decimal_type = std::make_shared(precision, scale); + auto field0 = field("f0", decimal_type); + auto field1 = field("f1", decimal_type); + auto schema = arrow::schema({field0, field1}); + + Decimal128TypePtr output_type; + auto status = DecimalTypeUtil::GetResultType( + DecimalTypeUtil::kOpAdd, {decimal_type, decimal_type}, &output_type); + + // output field + auto field_sum = field("add", output_type); + + // Build expression + auto sum = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); + + std::shared_ptr projector; + status = Projector::Make(schema, {sum}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()); + + Decimal128DataGenerator data_generator(large); + ProjectEvaluator evaluator(projector); + + status = TimedEvaluate( + schema, evaluator, data_generator, arrow::default_memory_pool(), 1 * MILLION, + 16 * THOUSAND, state); + ASSERT_OK(status); +} + +static void DecimalAdd2Fast(benchmark::State& state) { + // use lesser precision to test the fast-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision - 6, 18); +} + +static void DecimalAdd2LeadingZeroes(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision, 6); +} + +static void DecimalAdd2LeadingZeroesWithDiv(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision, 18); +} + +static void DecimalAdd2Large(benchmark::State& state) { + // use max precision to test the large-integer-path + 
DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision, 18, true); +} + +static void DecimalAdd3Fast(benchmark::State& state) { + // use lesser precision to test the fast-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision - 6, 18); +} + +static void DecimalAdd3LeadingZeroes(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 6); +} + +static void DecimalAdd3LeadingZeroesWithDiv(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18); +} + +static void DecimalAdd3Large(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18, true); +} + BENCHMARK(TimedTestAdd3)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestBigNested)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestBigNested)->MinTime(1.0)->Unit(benchmark::kMicrosecond); @@ -289,5 +399,13 @@ BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); } // namespace gandiva diff --git a/cpp/src/gandiva/tests/null_validity_test.cc b/cpp/src/gandiva/tests/null_validity_test.cc index 06cfdc08ba906..0374b68d46288 100644 --- a/cpp/src/gandiva/tests/null_validity_test.cc +++ b/cpp/src/gandiva/tests/null_validity_test.cc @@ -60,7 +60,7 @@ TEST_F(TestNullValidity, TestFunc) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -111,7 +111,7 @@ TEST_F(TestNullValidity, TestIfElse) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -148,7 +148,7 @@ TEST_F(TestNullValidity, TestUtf8) { // Build a projector for the expressions. 
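Aside on the decimal benchmarks above: DecimalTypeUtil::GetResultType derives the output precision and scale for kOpAdd. The sketch below states the usual SQL-style rule; the exact formula in Gandiva is assumed rather than quoted here, but it explains why precision kMaxPrecision - 6 exercises the fast path while kMaxPrecision forces the wide-integer path.

// Assumed SQL-style result type for decimal addition:
//   result_scale     = max(s1, s2)
//   result_precision = max(p1 - s1, p2 - s2) + result_scale + 1, capped at 38
// Applied to the benchmark inputs:
//   decimal(32, 18) + decimal(32, 18) -> decimal(33, 18)  (fits in 128 bits: fast path)
//   decimal(38, 6)  + decimal(38, 6)  -> decimal(38, 6)   (capped at 38: large-integer path)
Decimal128TypePtr out_type;
auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd,
                                             {decimal_type, decimal_type}, &out_type);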
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index ddcb729b3bfee..6c4eef53ded68 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -50,7 +50,7 @@ TEST_F(TestProjector, TestNonExistentFunction) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Function bool non_existent_function(float, float) not supported yet."; @@ -71,7 +71,7 @@ TEST_F(TestProjector, TestNotMatchingDataType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Return type of root node float does not match that of expression bool"; @@ -92,7 +92,7 @@ TEST_F(TestProjector, TestNotSupportedDataType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field f0 has unsupported data type list"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); @@ -113,7 +113,7 @@ TEST_F(TestProjector, TestIncorrectSchemaMissingField) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field f2 not in schema"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); @@ -135,7 +135,7 @@ TEST_F(TestProjector, TestIncorrectSchemaTypeNotMatching) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field definition in schema f2: int32 different from field in expression f2: float"; @@ -166,7 +166,7 @@ TEST_F(TestProjector, TestIfNotSupportedFunction) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -189,10 +189,8 @@ TEST_F(TestProjector, TestIfNotMatchingReturnType) { // Build a projector for the expressions. 
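The projector_build_validation_test.cc hunks here and below all reduce to one pattern: build an intentionally invalid expression and assert that Projector::Make fails validation. A condensed sketch; the message substring is illustrative only.

std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema, {bad_expr}, TestConfiguration(), &projector);
EXPECT_TRUE(status.IsExpressionValidationError());
// Older style, being removed from several tests: pin down the message substring.
EXPECT_TRUE(status.message().find("not supported") != std::string::npos);
// Newer, less brittle style: assert on the status code alone.
EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError);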
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Return type of if bool and then int32 not matching."; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestElseNotMatchingReturnType) { @@ -216,10 +214,8 @@ TEST_F(TestProjector, TestElseNotMatchingReturnType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Return type of if int32 and else bool not matching."; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestElseNotSupportedType) { @@ -243,10 +239,9 @@ TEST_F(TestProjector, TestElseNotSupportedType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Field c has unsupported data type list"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); + EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError); } TEST_F(TestProjector, TestAndMinChildren) { @@ -264,10 +259,8 @@ TEST_F(TestProjector, TestAndMinChildren) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Boolean expression has 1 children, expected atleast two"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestAndBooleanArgType) { @@ -287,12 +280,8 @@ TEST_F(TestProjector, TestAndBooleanArgType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = - "Boolean expression has a child with return type int32, expected return type " - "boolean"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } } // namespace gandiva diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index becaf8f1ba3d7..5c32f5024dba0 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -50,31 +50,51 @@ TEST_F(TestProjector, TestProjectCache) { auto sub_expr = TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); + auto configuration = TestConfiguration(); + std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr, sub_expr}, &projector); - EXPECT_TRUE(status.ok()); + auto status = Projector::Make(schema, {sum_expr, sub_expr}, configuration, &projector); + ASSERT_OK(status); // everything is same, should return the same projector. 
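The cache assertions that follow pin down Projector::Make's caching contract: an equal (schema, expressions, configuration) key must return the identical projector instance, and changing any component of the key must miss the cache. A minimal sketch, assuming pointer equality is the observable effect:

std::shared_ptr<Projector> p1, p2, p3;
ASSERT_OK(Projector::Make(schema, {sum_expr}, TestConfiguration(), &p1));
// Equal key, even via a fresh but equal Configuration object: cache hit.
ASSERT_OK(Projector::Make(schema, {sum_expr}, TestConfiguration(), &p2));
EXPECT_EQ(p1, p2);
// Different expression list: cache miss, a new projector is compiled.
ASSERT_OK(Projector::Make(schema, {sub_expr}, TestConfiguration(), &p3));
EXPECT_NE(p1, p3);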
auto schema_same = arrow::schema({field0, field1}); std::shared_ptr cached_projector; - status = Projector::Make(schema_same, {sum_expr, sub_expr}, &cached_projector); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(cached_projector.get() == projector.get()); + status = Projector::Make(schema_same, {sum_expr, sub_expr}, configuration, + &cached_projector); + ASSERT_OK(status); + EXPECT_EQ(cached_projector, projector); // schema is different should return a new projector. auto field2 = field("f2", int32()); auto different_schema = arrow::schema({field0, field1, field2}); std::shared_ptr should_be_new_projector; - status = - Projector::Make(different_schema, {sum_expr, sub_expr}, &should_be_new_projector); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(cached_projector.get() != should_be_new_projector.get()); + status = Projector::Make(different_schema, {sum_expr, sub_expr}, configuration, + &should_be_new_projector); + ASSERT_OK(status); + EXPECT_NE(cached_projector, should_be_new_projector); // expression list is different should return a new projector. std::shared_ptr should_be_new_projector1; - status = Projector::Make(schema, {sum_expr}, &should_be_new_projector1); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(cached_projector.get() != should_be_new_projector1.get()); + status = Projector::Make(schema, {sum_expr}, configuration, &should_be_new_projector1); + ASSERT_OK(status); + EXPECT_NE(cached_projector, should_be_new_projector1); + + // another instance of the same configuration, should return the same projector. + status = Projector::Make(schema, {sum_expr, sub_expr}, TestConfiguration(), + &cached_projector); + ASSERT_OK(status); + EXPECT_EQ(cached_projector, projector); + + // if configuration is different, should return a new projector. + auto other_configuration = + ConfigurationBuilder() + .set_byte_code_file_path("/" + std::string(GANDIVA_BYTE_COMPILE_FILE_PATH)) + .build(); + std::shared_ptr should_be_new_projector2; + status = Projector::Make(schema, {sum_expr, sub_expr}, other_configuration, + &should_be_new_projector2); + ASSERT_OK(status); + EXPECT_NE(projector, should_be_new_projector2); } TEST_F(TestProjector, TestProjectCacheFieldNames) { @@ -90,12 +110,13 @@ TEST_F(TestProjector, TestProjectCacheFieldNames) { auto sum_expr_01 = TreeExprBuilder::MakeExpression("add", {field0, field1}, sum_01); std::shared_ptr projector_01; - Status status = Projector::Make(schema, {sum_expr_01}, &projector_01); + auto status = + Projector::Make(schema, {sum_expr_01}, TestConfiguration(), &projector_01); EXPECT_TRUE(status.ok()); auto sum_expr_12 = TreeExprBuilder::MakeExpression("add", {field1, field2}, sum_12); std::shared_ptr projector_12; - status = Projector::Make(schema, {sum_expr_12}, &projector_12); + status = Projector::Make(schema, {sum_expr_12}, TestConfiguration(), &projector_12); EXPECT_TRUE(status.ok()); // add(f0, f1) != add(f1, f2) @@ -111,14 +132,16 @@ TEST_F(TestProjector, TestProjectCacheDouble) { auto literal0 = TreeExprBuilder::MakeLiteral(d0); auto expr0 = TreeExprBuilder::MakeExpression(literal0, res); + auto configuration = TestConfiguration(); + std::shared_ptr projector0; - auto status = Projector::Make(schema, {expr0}, &projector0); + auto status = Projector::Make(schema, {expr0}, configuration, &projector0); EXPECT_TRUE(status.ok()) << status.message(); auto literal1 = TreeExprBuilder::MakeLiteral(d1); auto expr1 = TreeExprBuilder::MakeExpression(literal1, res); std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = 
Projector::Make(schema, {expr1}, configuration, &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector0.get() != projector1.get()); @@ -134,13 +157,13 @@ TEST_F(TestProjector, TestProjectCacheFloat) { auto literal0 = TreeExprBuilder::MakeLiteral(f0); auto expr0 = TreeExprBuilder::MakeExpression(literal0, res); std::shared_ptr projector0; - auto status = Projector::Make(schema, {expr0}, &projector0); + auto status = Projector::Make(schema, {expr0}, TestConfiguration(), &projector0); EXPECT_TRUE(status.ok()) << status.message(); auto literal1 = TreeExprBuilder::MakeLiteral(f1); auto expr1 = TreeExprBuilder::MakeExpression(literal1, res); std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector0.get() != projector1.get()); @@ -162,50 +185,8 @@ TEST_F(TestProjector, TestIntSumSub) { TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr, sub_expr}, &projector); - EXPECT_TRUE(status.ok()); - - // Create a row-batch with some sample data - int num_records = 4; - auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false}); - auto array1 = MakeArrowArrayInt32({11, 13, 15, 17}, {true, true, false, true}); - // expected output - auto exp_sum = MakeArrowArrayInt32({12, 15, 0, 0}, {true, true, false, false}); - auto exp_sub = MakeArrowArrayInt32({-10, -11, 0, 0}, {true, true, false, false}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()); - - // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0)); - EXPECT_ARROW_ARRAY_EQUALS(exp_sub, outputs.at(1)); -} - -TEST_F(TestProjector, TestIntSumSubCustomConfig) { - // schema for input fields - auto field0 = field("f0", int32()); - auto field1 = field("f2", int32()); - auto schema = arrow::schema({field0, field1}); - - // output fields - auto field_sum = field("add", int32()); - auto field_sub = field("subtract", int32()); - - // Build expression - auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); - auto sub_expr = - TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); - - std::shared_ptr projector; - ConfigurationBuilder config_builder; - std::shared_ptr config = config_builder.build(); - - Status status = Projector::Make(schema, {sum_expr, sub_expr}, config, &projector); + auto status = + Projector::Make(schema, {sum_expr, sub_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -257,15 +238,17 @@ static void TestArithmeticOpsForType(arrow::MemoryPool* pool) { auto lt_expr = TreeExprBuilder::MakeExpression("less_than", {field0, field1}, field_lt); std::shared_ptr projector; - Status status = Projector::Make( - schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr}, &projector); + auto status = + Projector::Make(schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr}, + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data - int num_records = 4; - std::vector input0 = {1, 2, 53, 84}; - std::vector input1 = {10, 15, 23, 84}; - 
std::vector validity = {true, true, true, true}; + int num_records = 12; + std::vector input0 = {1, 2, 53, 84, 5, 15, 0, 1, 52, 83, 4, 120}; + std::vector input1 = {10, 15, 23, 84, 4, 51, 68, 9, 16, 18, 19, 37}; + std::vector validity = {true, true, true, true, true, true, + true, true, true, true, true, true}; auto array0 = MakeArrowArray(input0, validity); auto array1 = MakeArrowArray(input1, validity); @@ -344,9 +327,9 @@ TEST_F(TestProjector, TestExtendedMath) { TreeExprBuilder::MakeExpression("power", {field0, field1}, field_power); std::shared_ptr projector; - Status status = Projector::Make( + auto status = Projector::Make( schema, {cbrt_expr, exp_expr, log_expr, log10_expr, logb_expr, power_expr}, - &projector); + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -412,7 +395,7 @@ TEST_F(TestProjector, TestFloatLessThan) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -447,7 +430,7 @@ TEST_F(TestProjector, TestIsNotNull) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {myexpr}, &projector); + auto status = Projector::Make(schema, {myexpr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -480,7 +463,7 @@ TEST_F(TestProjector, TestZeroCopy) { auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res); std::shared_ptr projector; - Status status = Projector::Make(schema, {cast_expr}, &projector); + auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -493,14 +476,15 @@ TEST_F(TestProjector, TestZeroCopy) { // allocate output buffers int64_t bitmap_sz = arrow::BitUtil::BytesForBits(num_records); - std::unique_ptr bitmap(new uint8_t[bitmap_sz]); + int64_t bitmap_capacity = arrow::BitUtil::RoundUpToMultipleOf64(bitmap_sz); + std::vector bitmap(bitmap_capacity); std::shared_ptr bitmap_buf = - std::make_shared(bitmap.get(), bitmap_sz); + std::make_shared(&bitmap[0], bitmap_capacity); int64_t data_sz = sizeof(float) * num_records; - std::unique_ptr data(new uint8_t[data_sz]); + std::vector data(bitmap_capacity); std::shared_ptr data_buf = - std::make_shared(data.get(), data_sz); + std::make_shared(&data[0], data_sz); auto array_data = arrow::ArrayData::Make(float32(), num_records, {bitmap_buf, data_buf}); @@ -526,7 +510,7 @@ TEST_F(TestProjector, TestZeroCopyNegative) { auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res); std::shared_ptr projector; - Status status = Projector::Make(schema, {cast_expr}, &projector); + auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -596,7 +580,7 @@ TEST_F(TestProjector, TestDivideZero) { auto div_expr = TreeExprBuilder::MakeExpression("divide", {field0, field1}, field_div); std::shared_ptr projector; - Status status = Projector::Make(schema, {div_expr}, &projector); + auto status = Projector::Make(schema, {div_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -645,7 +629,7 @@ TEST_F(TestProjector, 
TestModZero) { auto mod_expr = TreeExprBuilder::MakeExpression("mod", {field0, field1}, field_div); std::shared_ptr projector; - Status status = Projector::Make(schema, {mod_expr}, &projector); + auto status = Projector::Make(schema, {mod_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/test_util.h b/cpp/src/gandiva/tests/test_util.h index d24448727bd83..0e0e27a0c9aa4 100644 --- a/cpp/src/gandiva/tests/test_util.h +++ b/cpp/src/gandiva/tests/test_util.h @@ -21,6 +21,7 @@ #include #include "arrow/test-util.h" #include "gandiva/arrow.h" +#include "gandiva/configuration.h" #ifndef GANDIVA_TEST_UTIL_H #define GANDIVA_TEST_UTIL_H @@ -46,6 +47,14 @@ static ArrayPtr MakeArrowArray(std::vector values) { return out; } +template +static ArrayPtr MakeArrowArray(const std::shared_ptr& type, + std::vector values, std::vector validity) { + ArrayPtr out; + arrow::ArrayFromVector(type, validity, values, &out); + return out; +} + template static ArrayPtr MakeArrowTypeArray(const std::shared_ptr& type, const std::vector& values, @@ -68,11 +77,22 @@ static ArrayPtr MakeArrowTypeArray(const std::shared_ptr& type, #define MakeArrowArrayFloat64 MakeArrowArray #define MakeArrowArrayUtf8 MakeArrowArray #define MakeArrowArrayBinary MakeArrowArray +#define MakeArrowArrayDecimal MakeArrowArray #define EXPECT_ARROW_ARRAY_EQUALS(a, b) \ EXPECT_TRUE((a)->Equals(b)) << "expected array: " << (a)->ToString() \ << " actual array: " << (b)->ToString(); +#define EXPECT_ARROW_TYPE_EQUALS(a, b) \ + EXPECT_TRUE((a)->Equals(b)) << "expected type: " << (a)->ToString() \ + << " actual type: " << (b)->ToString(); + +std::shared_ptr TestConfiguration() { + auto builder = ConfigurationBuilder(); + builder.set_byte_code_file_path(GANDIVA_BYTE_COMPILE_FILE_PATH); + return builder.build(); +} + } // namespace gandiva #endif // GANDIVA_TEST_UTIL_H diff --git a/cpp/src/gandiva/tests/timed_evaluate.h b/cpp/src/gandiva/tests/timed_evaluate.h index dab47c2f218be..9db7d88d2a226 100644 --- a/cpp/src/gandiva/tests/timed_evaluate.h +++ b/cpp/src/gandiva/tests/timed_evaluate.h @@ -100,7 +100,9 @@ Status TimedEvaluate(SchemaPtr schema, BaseEvaluator& evaluator, for (int col = 0; col < num_fields; col++) { std::vector data = GenerateData(batch_size, data_generator); std::vector validity(batch_size, true); - ArrayPtr col_data = MakeArrowArray(data, validity); + ArrayPtr col_data = + MakeArrowArray(schema->field(col)->type(), data, validity); + columns.push_back(col_data); } diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 8b09b72f32d03..925ceea836280 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -67,7 +67,8 @@ TEST_F(TestUtf8, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_a, expr_b, expr_c}, &projector); + auto status = + Projector::Make(schema, {expr_a, expr_b, expr_c}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -113,7 +114,7 @@ TEST_F(TestUtf8, TestLiteral) { // Build a projector for the expressions. 
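Usage note for the two helpers added to test_util.h above: TestConfiguration() gives every test an equal Configuration built from GANDIVA_BYTE_COMPILE_FILE_PATH, and the new type-parameterized MakeArrowArray overload, aliased as MakeArrowArrayDecimal, carries a parameterized type's precision and scale into the array. A hypothetical call; the scaling of the raw values is an assumption for illustration.

// decimal(38, 2): with scale 2, the stored value 123 reads as 1.23 (assumed).
auto type = arrow::decimal(38, 2);
auto arr = MakeArrowArrayDecimal(
    type, {arrow::Decimal128(123), arrow::Decimal128(456)}, {true, true});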
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -155,7 +156,7 @@ TEST_F(TestUtf8, TestNullLiteral) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -197,7 +198,7 @@ TEST_F(TestUtf8, TestLike) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -245,7 +246,7 @@ TEST_F(TestUtf8, TestBeginsEnds) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr1, expr2}, &projector); + auto status = Projector::Make(schema, {expr1, expr2}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -291,7 +292,7 @@ TEST_F(TestUtf8, TestInternalAllocs) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -334,7 +335,7 @@ TEST_F(TestUtf8, TestCastDate) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -389,7 +390,7 @@ TEST_F(TestUtf8, TestToDateNoError) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -444,7 +445,7 @@ TEST_F(TestUtf8, TestToDateError) { // Build a projector for the expressions. 
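The to_date tests here, together with the to_date_holder.cc hunk that follows, validate the literal arguments of to_date. For orientation, a to_date call tree has roughly this shape; the argument order and the date64 return type are assumptions inferred from the holder's error messages, not quotes.

auto node = TreeExprBuilder::MakeFunction(
    "to_date",
    {TreeExprBuilder::MakeField(field0),                // string column to parse
     TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD"),  // format: must be a string literal
     TreeExprBuilder::MakeLiteral(int32_t(1))},         // suppress_errors: must be an int literal
    arrow::date64());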
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/to_date_holder.cc b/cpp/src/gandiva/to_date_holder.cc index 9c8562280041d..f73d05fb71600 100644 --- a/cpp/src/gandiva/to_date_holder.cc +++ b/cpp/src/gandiva/to_date_holder.cc @@ -18,7 +18,7 @@ #include #include -#include "arrow/util/date.h" +#include "arrow/vendored/date.h" #include "gandiva/date_utils.h" #include "gandiva/execution_context.h" @@ -44,7 +44,7 @@ Status ToDateHolder::Make(const FunctionNode& node, return Status::Invalid( "'to_date' function requires a string literal as the second parameter"); } - auto pattern = boost::get(literal_pattern->holder()); + auto pattern = literal_pattern->holder().get(); auto literal_suppress_errors = dynamic_cast(node.children().at(2).get()); if (literal_pattern == nullptr) { @@ -57,7 +57,7 @@ Status ToDateHolder::Make(const FunctionNode& node, return Status::Invalid( "'to_date' function requires a int literal as the third parameter"); } - auto suppress_errors = boost::get(literal_suppress_errors->holder()); + auto suppress_errors = literal_suppress_errors->holder().get(); return Make(pattern, suppress_errors, holder); } diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 86a2824075497..a63b700c2eeae 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -19,6 +19,7 @@ #include +#include "gandiva/decimal_type_util.h" #include "gandiva/gandiva_aliases.h" #include "gandiva/node.h" @@ -49,6 +50,11 @@ NodePtr TreeExprBuilder::MakeBinaryLiteral(const std::string& value) { return std::make_shared(arrow::binary(), LiteralHolder(value), false); } +NodePtr TreeExprBuilder::MakeDecimalLiteral(const DecimalScalar128& value) { + return std::make_shared(arrow::decimal(value.precision(), value.scale()), + LiteralHolder(value), false); +} + NodePtr TreeExprBuilder::MakeNull(DataTypePtr data_type) { static const std::string empty; @@ -92,6 +98,10 @@ NodePtr TreeExprBuilder::MakeNull(DataTypePtr data_type) { return std::make_shared(data_type, LiteralHolder((int64_t)0), true); case arrow::Type::TIMESTAMP: return std::make_shared(data_type, LiteralHolder((int64_t)0), true); + case arrow::Type::DECIMAL: { + DecimalScalar128 literal(0, 0); + return std::make_shared(data_type, LiteralHolder(literal), true); + } default: return nullptr; } diff --git a/cpp/src/gandiva/tree_expr_builder.h b/cpp/src/gandiva/tree_expr_builder.h index cd261c8bf978d..3d60b5b96168d 100644 --- a/cpp/src/gandiva/tree_expr_builder.h +++ b/cpp/src/gandiva/tree_expr_builder.h @@ -23,7 +23,9 @@ #include #include +#include "arrow/type.h" #include "gandiva/condition.h" +#include "gandiva/decimal_scalar.h" #include "gandiva/expression.h" namespace gandiva { @@ -45,6 +47,7 @@ class TreeExprBuilder { static NodePtr MakeLiteral(double value); static NodePtr MakeStringLiteral(const std::string& value); static NodePtr MakeBinaryLiteral(const std::string& value); + static NodePtr MakeDecimalLiteral(const DecimalScalar128& value); /// \brief create a node on a null literal. /// returns null if data_type is null or if it's not a supported datatype. 
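Rounding out the tree_expr_builder changes above: decimal literals are carried as DecimalScalar128 values, and MakeNull on a decimal type produces a typed null literal. A sketch using only constructors visible in this diff; that the two-argument DecimalScalar128 form is (precision, scale) is an assumption drawn from the MakeNull hunk.

// Typed null decimal literal, as MakeNull's DECIMAL case produces.
auto null_node = TreeExprBuilder::MakeNull(arrow::decimal(38, 2));

// Non-null decimal literal: the node's type is decimal(precision, scale)
// taken from the scalar, per MakeDecimalLiteral above.
DecimalScalar128 scalar(38, 2);  // assumed (precision, scale); value zero
auto lit_node = TreeExprBuilder::MakeDecimalLiteral(scalar);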
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 1538b58164b62..e3294bdee4dbb 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(parquet-all) +add_custom_target(parquet) +add_custom_target(parquet-benchmarks) +add_custom_target(parquet-tests) +add_dependencies(parquet-all parquet parquet-tests parquet-benchmarks) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/.parquetcppversion" PARQUET_VERSION) string(REPLACE "\n" "" PARQUET_VERSION "${PARQUET_VERSION}") string(REGEX MATCH "^([0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+)?)" VERSION ${PARQUET_VERSION}) @@ -22,9 +28,6 @@ if(NOT VERSION) message(FATAL_ERROR "invalid .parquetcppversion") endif() -# For "make parquet" to build everything Parquet-related -add_custom_target(parquet) - function(ADD_PARQUET_TEST REL_TEST_NAME) set(options USE_STATIC_LINKING) set(one_value_args) @@ -34,20 +37,39 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() + set(TEST_ARGUMENTS + PREFIX "parquet" + LABELS "parquet-tests") + # By default we prefer shared linking with libparquet, as it's faster # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_STATIC_TEST_LINK_LIBS} - PREFIX "parquet" - LABELS "unittest;parquet") + ${TEST_ARGUMENTS}) else() - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_SHARED_TEST_LINK_LIBS} - PREFIX "parquet" - LABELS "unittest;parquet") + ${TEST_ARGUMENTS}) + endif() +endfunction() + +function(ADD_PARQUET_BENCHMARK REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "parquet") endif() + ADD_BENCHMARK(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS "parquet-benchmarks" + ${PARQUET_BENCHMARK_LINK_OPTION} + ${ARG_UNPARSED_ARGUMENTS}) endfunction() # ---------------------------------------------------------------------- @@ -76,8 +98,8 @@ if(MSVC) endif() set(PARQUET_MIN_TEST_LIBS - gtest_main_static - gtest_static) + ${GTEST_MAIN_LIBRARY} + ${GTEST_LIBRARY}) if (APPLE) set(PARQUET_MIN_TEST_LIBS @@ -101,9 +123,15 @@ set(PARQUET_STATIC_TEST_LINK_LIBS ${ARROW_LIBRARIES_FOR_STATIC_TESTS} parquet_static) -set(PARQUET_BENCHMARK_LINK_LIBRARIES - arrow_benchmark_main - parquet_shared) +if (WIN32) + # The benchmarks depend on some static Thrift symbols + set(PARQUET_BENCHMARK_LINK_OPTION + STATIC_LINK_LIBS arrow_benchmark_main + parquet_static) +else() + set(PARQUET_BENCHMARK_LINK_OPTION + EXTRA_LINK_LIBS parquet_shared) +endif() ############################################################ # Generated Thrift sources @@ -144,6 +172,7 @@ set(PARQUET_SRCS column_reader.cc column_scanner.cc column_writer.cc + encoding.cc file_reader.cc file_writer.cc metadata.cc @@ -185,12 +214,19 @@ if (NOT PARQUET_MINIMAL_DEPENDENCY) # Although we don't link parquet_objlib against anything, we need it to depend # on these libs as we may generate their headers via ExternalProject_Add -set(PARQUET_DEPENDENCIES ${PARQUET_DEPENDENCIES} - ${PARQUET_SHARED_LINK_LIBS} - ${PARQUET_SHARED_PRIVATE_LINK_LIBS} - ${PARQUET_STATIC_LINK_LIBS}) +if 
(ARROW_BUILD_SHARED) + set(PARQUET_DEPENDENCIES ${PARQUET_DEPENDENCIES} + ${PARQUET_SHARED_LINK_LIBS} + ${PARQUET_SHARED_PRIVATE_LINK_LIBS}) endif() +if (ARROW_BUILD_STATIC) + set(PARQUET_DEPENDENCIES ${PARQUET_DEPENDENCIES} + ${PARQUET_STATIC_LINK_LIBS}) +endif() + +endif(NOT PARQUET_MINIMAL_DEPENDENCY) + if(NOT APPLE AND NOT MSVC) # Localize thirdparty symbols using a linker version script. This hides them # from the client application. The OS X linker does not support the @@ -217,6 +253,8 @@ ADD_ARROW_LIB(parquet STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS} ) +add_dependencies(parquet ${PARQUET_LIBRARIES}) + # Thrift requires these definitions for some types that we use foreach(LIB_TARGET ${PARQUET_LIBRARIES}) target_compile_definitions(${LIB_TARGET} @@ -232,32 +270,18 @@ foreach(LIB_TARGET ${PARQUET_LIBRARIES}) endif() endforeach() -add_dependencies(parquet ${PARQUET_LIBRARIES}) +# We always build the Parquet static libraries (see PARQUET-1420) so we add the +# PARQUET_STATIC public compile definition if we are building the unit tests OR +# if we are building the static library +if (WIN32 AND (NOT NO_TESTS OR ARROW_BUILD_STATIC)) + target_compile_definitions(parquet_static PUBLIC PARQUET_STATIC) +endif() add_subdirectory(api) add_subdirectory(arrow) add_subdirectory(util) -# Headers: top level -install(FILES - bloom_filter.h - column_reader.h - column_page.h - column_scanner.h - column_writer.h - encoding.h - exception.h - file_reader.h - file_writer.h - hasher.h - metadata.h - murmur3.h - printer.h - properties.h - schema.h - statistics.h - types.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") +ARROW_INSTALL_ALL_HEADERS("parquet") configure_file(parquet_version.h.in "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" @@ -268,13 +292,7 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") # pkg-config support -configure_file(parquet.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/parquet.pc" - @ONLY) - -install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/parquet.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("parquet") ADD_PARQUET_TEST(bloom_filter-test) ADD_PARQUET_TEST(column_reader-test) @@ -294,9 +312,10 @@ ADD_PARQUET_TEST(reader-test) ADD_PARQUET_TEST(file-deserialize-test USE_STATIC_LINKING) ADD_PARQUET_TEST(schema-test USE_STATIC_LINKING) -ADD_ARROW_BENCHMARK(column-io-benchmark - PREFIX "parquet" - EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) -ADD_ARROW_BENCHMARK(encoding-benchmark - PREFIX "parquet" - EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) +ADD_PARQUET_BENCHMARK(column-io-benchmark) +ADD_PARQUET_BENCHMARK(encoding-benchmark) + +# Required for tests, the ExternalProject for zstd does not build on CMake < 3.7 +if (ARROW_WITH_ZSTD) + add_definitions(-DARROW_WITH_ZSTD) +endif() diff --git a/cpp/src/parquet/api/CMakeLists.txt b/cpp/src/parquet/api/CMakeLists.txt index 79fc716952a16..48fddb9d61ddf 100644 --- a/cpp/src/parquet/api/CMakeLists.txt +++ b/cpp/src/parquet/api/CMakeLists.txt @@ -16,9 +16,4 @@ # under the License. 
# Headers: public api -install(FILES - io.h - reader.h - writer.h - schema.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/api") +ARROW_INSTALL_ALL_HEADERS("parquet/api") diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 429dadcd37e5e..ba9e93df7b87a 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -18,13 +18,7 @@ ADD_PARQUET_TEST(arrow-schema-test) ADD_PARQUET_TEST(arrow-reader-writer-test) -ADD_ARROW_BENCHMARK(reader-writer-benchmark - PREFIX "parquet-arrow" - EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) +ADD_PARQUET_BENCHMARK(reader-writer-benchmark + PREFIX "parquet-arrow") -# Headers: top level -install(FILES - reader.h - schema.h - writer.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/arrow") +ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 24ec0dd24eec3..bb9763224f3ba 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -464,7 +464,11 @@ class TestParquetIO : public ::testing::Test { ASSERT_OK_NO_THROW(file_reader->GetColumn(0, &column_reader)); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); + std::shared_ptr chunked_out; + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, &chunked_out)); + + ASSERT_EQ(1, chunked_out->num_chunks()); + *out = chunked_out->chunk(0); ASSERT_NE(nullptr, out->get()); } @@ -1189,65 +1193,116 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out, bool nanos_as_micros = auto f0 = field("f0", ::arrow::date32()); auto f1 = field("f1", ::arrow::timestamp(TimeUnit::MILLI)); auto f2 = field("f2", ::arrow::timestamp(TimeUnit::MICRO)); - std::shared_ptr<::arrow::Field> f3; - if (nanos_as_micros) { - f3 = field("f3", ::arrow::timestamp(TimeUnit::MICRO)); - } else { - f3 = field("f3", ::arrow::timestamp(TimeUnit::NANO)); - } + auto f3_unit = nanos_as_micros ? TimeUnit::MICRO : TimeUnit::NANO; + auto f3 = field("f3", ::arrow::timestamp(f3_unit)); auto f4 = field("f4", ::arrow::time32(TimeUnit::MILLI)); auto f5 = field("f5", ::arrow::time64(TimeUnit::MICRO)); + std::shared_ptr<::arrow::Schema> schema(new ::arrow::Schema({f0, f1, f2, f3, f4, f5})); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; - std::vector t64_values = {1489269000000, 1489270000000, 1489271000000, - 1489272000000, 1489272000000, 1489273000000}; + std::vector t64_ns_values = {1489269000000, 1489270000000, 1489271000000, + 1489272000000, 1489272000000, 1489273000000}; std::vector t64_us_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; + std::vector t64_ms_values = {1489269, 1489270, 1489271, + 1489272, 1489272, 1489273}; std::shared_ptr a0, a1, a2, a3, a4, a5; ArrayFromVector<::arrow::Date32Type, int32_t>(f0->type(), is_valid, t32_values, &a0); - ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_values, &a1); - ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_values, &a2); - if (nanos_as_micros) { - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_us_values, - &a3); - } else { - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_values, - &a3); - } + ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_ms_values, + &a1); + ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_us_values, + &a2); + auto f3_data = nanos_as_micros ? t64_us_values : t64_ns_values; + ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, f3_data, &a3); ArrayFromVector<::arrow::Time32Type, int32_t>(f4->type(), is_valid, t32_values, &a4); - ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_values, &a5); + ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); std::vector> columns = { std::make_shared("f0", a0), std::make_shared("f1", a1), std::make_shared("f2", a2), std::make_shared("f3", a3), std::make_shared("f4", a4), std::make_shared("f5", a5)}; + *out = Table::Make(schema, columns); } TEST(TestArrowReadWrite, DateTimeTypes) { - std::shared_ptr
table; + std::shared_ptr<Table>
table, result; MakeDateTimeTypesTable(&table); - // Use deprecated INT96 type - std::shared_ptr<Table>
result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - table, false /* use_threads */, table->num_rows(), {}, &result, - ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); - - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); - - // Cast nanoseconds to microseconds and use INT64 physical type ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); - std::shared_ptr<Table>
expected; MakeDateTimeTypesTable(&table, true); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); } +TEST(TestArrowReadWrite, UseDeprecatedInt96) { + using ::arrow::ArrayFromVector; + using ::arrow::field; + using ::arrow::schema; + + std::vector is_valid = {true, true, true, false, true, true}; + + auto t_s = ::arrow::timestamp(TimeUnit::SECOND); + auto t_ms = ::arrow::timestamp(TimeUnit::MILLI); + auto t_us = ::arrow::timestamp(TimeUnit::MICRO); + auto t_ns = ::arrow::timestamp(TimeUnit::NANO); + + std::vector s_values = {1489269, 1489270, 1489271, 1489272, 1489272, 1489273}; + std::vector ms_values = {1489269000, 1489270000, 1489271000, + 1489272001, 1489272000, 1489273000}; + std::vector us_values = {1489269000000, 1489270000000, 1489271000000, + 1489272000001, 1489272000000, 1489273000000}; + std::vector ns_values = {1489269000000000LL, 1489270000000000LL, + 1489271000000000LL, 1489272000000001LL, + 1489272000000000LL, 1489273000000000LL}; + + std::shared_ptr a_s, a_ms, a_us, a_ns; + ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid, s_values, &a_s); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, is_valid, ms_values, &a_ms); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, is_valid, us_values, &a_us); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, is_valid, ns_values, &a_ns); + + // Each input is typed with a unique TimeUnit + auto input_schema = schema( + {field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), field("f_ns", t_ns)}); + auto input = Table::Make( + input_schema, + {std::make_shared("f_s", a_s), std::make_shared("f_ms", a_ms), + std::make_shared("f_us", a_us), std::make_shared("f_ns", a_ns)}); + + // When reading parquet files, all int96 schema fields are converted to + // timestamp nanoseconds + auto ex_schema = schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), + field("f_ns", t_ns)}); + auto ex_result = Table::Make( + ex_schema, + {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), + std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); + + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &result, + ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); + + // Ensure enable_deprecated_int96_timestamps has precedence over + // coerce_timestamps. + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(input, false /* use_threads */, + input->num_rows(), {}, &result, + ArrowWriterProperties::Builder() + .enable_deprecated_int96_timestamps() + ->coerce_timestamps(TimeUnit::MILLI) + ->build())); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); +} + TEST(TestArrowReadWrite, CoerceTimestamps) { using ::arrow::ArrayFromVector; using ::arrow::field; @@ -1293,6 +1348,12 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { {std::make_shared("f_s", a_ms), std::make_shared("f_ms", a_ms), std::make_shared("f_us", a_ms), std::make_shared("f_ns", a_ms)}); + std::shared_ptr<Table>
milli_result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &milli_result, + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); + // Result when coercing to microseconds auto s3 = std::shared_ptr<::arrow::Schema>( new ::arrow::Schema({field("f_s", t_us), field("f_ms", t_us), field("f_us", t_us), @@ -1302,13 +1363,6 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { {std::make_shared("f_s", a_us), std::make_shared("f_ms", a_us), std::make_shared("f_us", a_us), std::make_shared("f_ns", a_us)}); - std::shared_ptr
milli_result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - input, false /* use_threads */, input->num_rows(), {}, &milli_result, - ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); - - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); - std::shared_ptr
micro_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, µ_result, @@ -1453,65 +1507,6 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_table, *result)); } -// Regression for ARROW-2802 -TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) { - using ::arrow::Column; - using ::arrow::default_memory_pool; - using ::arrow::Field; - using ::arrow::Schema; - using ::arrow::Table; - using ::arrow::TimestampBuilder; - using ::arrow::TimestampType; - using ::arrow::TimeUnit; - - auto timestamp_type = std::make_shared(TimeUnit::NANO); - - TimestampBuilder builder(timestamp_type, default_memory_pool()); - for (std::int64_t ii = 0; ii < 10; ++ii) { - ASSERT_OK(builder.Append(1000000000L * ii)); - } - std::shared_ptr values; - ASSERT_OK(builder.Finish(&values)); - - std::vector> fields; - auto field = std::make_shared("nanos", timestamp_type); - fields.emplace_back(field); - - auto schema = std::make_shared(fields); - - std::vector> columns; - auto column = std::make_shared("nanos", values); - columns.emplace_back(column); - - auto table = Table::Make(schema, columns); - - auto arrow_writer_properties = ArrowWriterProperties::Builder() - .coerce_timestamps(TimeUnit::MICRO) - ->enable_deprecated_int96_timestamps() - ->build(); - - std::shared_ptr
result; - DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result, - arrow_writer_properties); - - ASSERT_EQ(table->num_columns(), result->num_columns()); - ASSERT_EQ(table->num_rows(), result->num_rows()); - - auto actual_column = result->column(0); - auto data = actual_column->data(); - auto expected_values = - static_cast<::arrow::NumericArray*>(values.get())->raw_values(); - for (int ii = 0; ii < data->num_chunks(); ++ii) { - auto chunk = - static_cast<::arrow::NumericArray*>(data->chunk(ii).get()); - auto values = chunk->raw_values(); - for (int64_t jj = 0; jj < chunk->length(); ++jj, ++expected_values) { - // Check that the nanos have been converted to micros - ASSERT_EQ(*expected_values / 1000, values[jj]); - } - } -} - void MakeDoubleTable(int num_columns, int num_rows, int nchunks, std::shared_ptr
* out) { std::shared_ptr<::arrow::Column> column; @@ -1745,10 +1740,11 @@ TEST(TestArrowReadWrite, ListLargeRecords) { std::vector> pieces; for (int i = 0; i < num_rows; ++i) { - std::shared_ptr piece; - ASSERT_OK(col_reader->NextBatch(1, &piece)); - ASSERT_EQ(1, piece->length()); - pieces.push_back(piece); + std::shared_ptr chunked_piece; + ASSERT_OK(col_reader->NextBatch(1, &chunked_piece)); + ASSERT_EQ(1, chunked_piece->length()); + ASSERT_EQ(1, chunked_piece->num_chunks()); + pieces.push_back(chunked_piece->chunk(0)); } auto chunked = std::make_shared<::arrow::ChunkedArray>(pieces); @@ -2284,30 +2280,45 @@ TEST_P(TestNestedSchemaRead, DeepNestedSchemaRead) { INSTANTIATE_TEST_CASE_P(Repetition_type, TestNestedSchemaRead, ::testing::Values(Repetition::REQUIRED, Repetition::OPTIONAL)); -TEST(TestImpalaConversion, NanosecondToImpala) { +TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { // June 20, 2017 16:32:56 and 123456789 nanoseconds int64_t nanoseconds = INT64_C(1497976376123456789); - Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}; + Int96 calculated; + + Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}; internal::NanosecondsToImpalaTimestamp(nanoseconds, &calculated); ASSERT_EQ(expected, calculated); } -TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { - // PARQUET-995 +void TryReadDataFile(const std::string& testing_file_path, bool should_succeed = true) { std::string dir_string(test::get_data_dir()); std::stringstream ss; - ss << dir_string << "/" - << "alltypes_plain.parquet"; + ss << dir_string << "/" << testing_file_path; auto path = ss.str(); auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; - ASSERT_NO_THROW( - arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)))); - std::shared_ptr<::arrow::Table> table; - ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table)); + try { + arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false))); + std::shared_ptr<::arrow::Table> table; + ASSERT_OK(arrow_reader->ReadTable(&table)); + } catch (const ParquetException& e) { + if (should_succeed) { + FAIL() << "Exception thrown when reading file: " << e.what(); + } + } +} + +TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { + // PARQUET-995 + TryReadDataFile("alltypes_plain.parquet"); +} + +TEST(TestArrowReaderAdHoc, CorruptedSchema) { + // PARQUET-1481 + TryReadDataFile("bad_data/PARQUET-1481.parquet", false /* should_succeed */); } class TestArrowReaderAdHocSparkAndHvr diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index cb2b8508e66a5..73de8b1c456c9 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -21,6 +21,7 @@ #include "gtest/gtest.h" #include "parquet/arrow/schema.h" +#include "parquet/schema.h" #include "arrow/api.h" #include "arrow/test-util.h" diff --git a/cpp/src/parquet/arrow/reader-writer-benchmark.cc b/cpp/src/parquet/arrow/reader-writer-benchmark.cc index 775c1028bb43f..1889006573b6b 100644 --- a/cpp/src/parquet/arrow/reader-writer-benchmark.cc +++ b/cpp/src/parquet/arrow/reader-writer-benchmark.cc @@ -142,7 +142,8 @@ std::shared_ptr<::arrow::Table> TableFromVector(const std::vector static void BM_WriteColumn(::benchmark::State& state) { - std::vector values(BENCHMARK_SIZE, 128); + using T = typename ParquetType::c_type; + std::vector values(BENCHMARK_SIZE, static_cast(128)); std::shared_ptr<::arrow::Table> table = 
TableFromVector(values, nullable); while (state.KeepRunning()) { @@ -167,7 +168,9 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType); template static void BM_ReadColumn(::benchmark::State& state) { - std::vector values(BENCHMARK_SIZE, 128); + using T = typename ParquetType::c_type; + + std::vector values(BENCHMARK_SIZE, static_cast(128)); std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable); auto output = std::make_shared(); EXIT_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE)); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 6273fda464025..0b60c66f9a2bc 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -21,17 +21,26 @@ #include #include #include -#include -#include #include #include #include -#include "arrow/api.h" +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" +// For arrow::compute::Datum. This should perhaps be promoted. See ARROW-4022 +#include "arrow/compute/kernel.h" + #include "parquet/arrow/record_reader.h" #include "parquet/arrow/schema.h" #include "parquet/column_reader.h" @@ -46,6 +55,7 @@ using arrow::Array; using arrow::BooleanArray; +using arrow::ChunkedArray; using arrow::Column; using arrow::Field; using arrow::Int32Array; @@ -57,6 +67,9 @@ using arrow::StructArray; using arrow::Table; using arrow::TimestampArray; +// For Array/ChunkedArray variant +using arrow::compute::Datum; + using parquet::schema::Node; // Help reduce verbosity @@ -69,21 +82,24 @@ namespace parquet { namespace arrow { using ::arrow::BitUtil::BytesForBits; +using ::arrow::BitUtil::FromBigEndian; +using ::arrow::internal::SafeLeftShift; -constexpr int64_t kJulianToUnixEpochDays = 2440588LL; -constexpr int64_t kMillisecondsInADay = 86400000LL; -constexpr int64_t kNanosecondsInADay = kMillisecondsInADay * 1000LL * 1000LL; +template +using ArrayType = typename ::arrow::TypeTraits::ArrayType; -static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timestamp) { - int64_t days_since_epoch = impala_timestamp.value[2] - kJulianToUnixEpochDays; - int64_t nanoseconds = 0; +namespace { - memcpy(&nanoseconds, &impala_timestamp.value, sizeof(int64_t)); - return days_since_epoch * kNanosecondsInADay + nanoseconds; +Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) { + DCHECK_GT(chunked.num_chunks(), 0); + if (chunked.num_chunks() > 1) { + return Status::Invalid("Function call returned a chunked array"); + } + *out = chunked.chunk(0); + return Status::OK(); } -template -using ArrayType = typename ::arrow::TypeTraits::ArrayType; +} // namespace // ---------------------------------------------------------------------- // Iteration utilities @@ -223,15 +239,18 @@ class FileReader::Impl { virtual ~Impl() {} Status GetColumn(int i, std::unique_ptr* out); - Status ReadSchemaField(int i, std::shared_ptr* out); + + Status ReadSchemaField(int i, std::shared_ptr* out); Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out); + std::shared_ptr* out); + Status ReadColumn(int i, std::shared_ptr* out); + Status ReadColumnChunk(int column_index, int row_group_index, + std::shared_ptr* out); + Status GetReaderForNode(int index, 
const Node* node, const std::vector& indices, int16_t def_level, std::unique_ptr* out); - Status ReadColumn(int i, std::shared_ptr* out); - Status ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out); + Status GetSchema(std::shared_ptr<::arrow::Schema>* out); Status GetSchema(const std::vector& indices, std::shared_ptr<::arrow::Schema>* out); @@ -267,7 +286,8 @@ class FileReader::Impl { class ColumnReader::ColumnReaderImpl { public: virtual ~ColumnReaderImpl() {} - virtual Status NextBatch(int64_t records_to_read, std::shared_ptr* out) = 0; + virtual Status NextBatch(int64_t records_to_read, + std::shared_ptr* out) = 0; virtual Status GetDefLevels(const int16_t** data, size_t* length) = 0; virtual Status GetRepLevels(const int16_t** data, size_t* length) = 0; virtual const std::shared_ptr field() = 0; @@ -283,10 +303,10 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { NextRowGroup(); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; template - Status WrapIntoListArray(std::shared_ptr* array); + Status WrapIntoListArray(Datum* inout_array); Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; @@ -314,7 +334,7 @@ class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { InitField(node, children); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } @@ -395,7 +415,7 @@ Status FileReader::Impl::GetReaderForNode( return Status::OK(); } -Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { std::vector indices(reader_->metadata()->num_columns()); for (size_t j = 0; j < indices.size(); ++j) { @@ -406,7 +426,7 @@ Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out) { + std::shared_ptr* out) { auto parquet_schema = reader_->metadata()->schema(); auto node = parquet_schema->group_node()->field(i).get(); @@ -432,7 +452,7 @@ Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, return reader->NextBatch(records_to_read, out); } -Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { std::unique_ptr flat_column_reader; RETURN_NOT_OK(GetColumn(i, &flat_column_reader)); @@ -452,7 +472,7 @@ Status FileReader::Impl::GetSchema(const std::vector& indices, } Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out) { + std::shared_ptr* out) { auto rg_metadata = reader_->metadata()->RowGroup(row_group_index); int64_t records_to_read = rg_metadata->ColumnChunk(column_index)->num_values(); @@ -463,10 +483,7 @@ Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, new PrimitiveImpl(pool_, std::move(input))); ColumnReader flat_column_reader(std::move(impl)); - std::shared_ptr array; - RETURN_NOT_OK(flat_column_reader.NextBatch(records_to_read, &array)); - *out = array; - return 
Status::OK(); + return flat_column_reader.NextBatch(records_to_read, out); } Status FileReader::Impl::ReadRowGroup(int row_group_index, @@ -485,7 +502,7 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, auto ReadColumnFunc = [&indices, &row_group_index, &schema, &columns, this](int i) { int column_index = indices[i]; - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(column_index, row_group_index, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -532,7 +549,7 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, std::vector> columns(num_fields); auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -576,8 +593,6 @@ Status FileReader::Impl::ReadTable(std::shared_ptr
<Table>* table) { Status FileReader::Impl::ReadRowGroups(const std::vector<int>& row_groups, const std::vector<int>& indices, std::shared_ptr<Table>
* table) { - // TODO(PARQUET-1393): Modify the record readers to already read this into a single, - // continuous array. std::vector> tables(row_groups.size(), nullptr); for (size_t i = 0; i < row_groups.size(); ++i) { @@ -633,7 +648,7 @@ Status FileReader::GetSchema(const std::vector& indices, return impl_->GetSchema(indices, out); } -Status FileReader::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { try { return impl_->ReadColumn(i, out); } catch (const ::parquet::ParquetException& e) { @@ -641,7 +656,7 @@ Status FileReader::ReadColumn(int i, std::shared_ptr* out) { } } -Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { try { return impl_->ReadSchemaField(i, out); } catch (const ::parquet::ParquetException& e) { @@ -649,6 +664,18 @@ Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { } } +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadColumn(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadSchemaField(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, std::shared_ptr* out) { std::vector indices(impl_->num_columns()); @@ -671,10 +698,8 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice int max_num = num_row_groups(); for (auto row_group_index : row_group_indices) { if (row_group_index < 0 || row_group_index >= max_num) { - std::ostringstream ss; - ss << "Some index in row_group_indices is " << row_group_index - << ", which is either < 0 or >= num_row_groups(" << max_num << ")"; - return Status::Invalid(ss.str()); + return Status::Invalid("Some index in row_group_indices is ", row_group_index, + ", which is either < 0 or >= num_row_groups(", max_num, ")"); } } @@ -764,7 +789,28 @@ const ParquetFileReader* FileReader::parquet_reader() const { } template -Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { +Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { + if (descr_->max_repetition_level() == 0) { + // Flat, no action + return Status::OK(); + } + + std::shared_ptr flat_array; + + // ARROW-3762(wesm): If inout_array is a chunked array, we reject as this is + // not yet implemented + if (inout_array->kind() == Datum::CHUNKED_ARRAY) { + if (inout_array->chunked_array()->num_chunks() > 1) { + return Status::NotImplemented( + "Nested data conversions not implemented for " + "chunked array outputs"); + } + flat_array = inout_array->chunked_array()->chunk(0); + } else { + DCHECK_EQ(Datum::ARRAY, inout_array->kind()); + flat_array = inout_array->make_array(); + } + const int16_t* def_levels = record_reader_->def_levels(); const int16_t* rep_levels = record_reader_->rep_levels(); const int64_t total_levels_read = record_reader_->levels_position(); @@ -775,110 +821,106 @@ Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { &arrow_schema)); std::shared_ptr current_field = arrow_schema->field(0); - if (descr_->max_repetition_level() > 0) { - // Walk downwards to extract nullability - std::vector nullable; - std::vector> offset_builders; - std::vector> valid_bits_builders; - nullable.push_back(current_field->nullable()); - while (current_field->type()->num_children() > 0) { - if 
(current_field->type()->num_children() > 1) { - return Status::NotImplemented( - "Fields with more than one child are not supported."); - } else { - if (current_field->type()->id() != ::arrow::Type::LIST) { - return Status::NotImplemented( - "Currently only nesting with Lists is supported."); - } - current_field = current_field->type()->child(0); + // Walk downwards to extract nullability + std::vector nullable; + std::vector> offset_builders; + std::vector> valid_bits_builders; + nullable.push_back(current_field->nullable()); + while (current_field->type()->num_children() > 0) { + if (current_field->type()->num_children() > 1) { + return Status::NotImplemented("Fields with more than one child are not supported."); + } else { + if (current_field->type()->id() != ::arrow::Type::LIST) { + return Status::NotImplemented("Currently only nesting with Lists is supported."); } - offset_builders.emplace_back( - std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); - valid_bits_builders.emplace_back( - std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); - nullable.push_back(current_field->nullable()); + current_field = current_field->type()->child(0); } + offset_builders.emplace_back( + std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); + valid_bits_builders.emplace_back( + std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); + nullable.push_back(current_field->nullable()); + } - int64_t list_depth = offset_builders.size(); - // This describes the minimal definition that describes a level that - // reflects a value in the primitive values array. - int16_t values_def_level = descr_->max_definition_level(); - if (nullable[nullable.size() - 1]) { - values_def_level--; - } + int64_t list_depth = offset_builders.size(); + // This describes the minimal definition that describes a level that + // reflects a value in the primitive values array. + int16_t values_def_level = descr_->max_definition_level(); + if (nullable[nullable.size() - 1]) { + values_def_level--; + } - // The definition levels that are needed so that a list is declared - // as empty and not null. - std::vector empty_def_level(list_depth); - int def_level = 0; - for (int i = 0; i < list_depth; i++) { - if (nullable[i]) { - def_level++; - } - empty_def_level[i] = static_cast(def_level); + // The definition levels that are needed so that a list is declared + // as empty and not null. 
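Illustrative aside (not part of the patch): for a nullable list<int32> column with a nullable element, Parquet's three-level list encoding yields max_repetition_level = 1 and max_definition_level = 3, and the code here derives

  def level 0 -> the list itself is null
  def level 1 -> the list is present but empty    (empty_def_level[0] == 1)
  def level 2 -> a present list with a null element
  def level 3 -> a present list with a non-null element

Because the leaf is nullable, values_def_level = 3 - 1 = 2, so the loop below advances values_offset for def levels 2 and 3 alike: null elements still occupy a slot in the spaced values array. Repetition level 0 starts a new list; repetition level 1 appends to the current one.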
+ std::vector empty_def_level(list_depth); + int def_level = 0; + for (int i = 0; i < list_depth; i++) { + if (nullable[i]) { def_level++; } + empty_def_level[i] = static_cast(def_level); + def_level++; + } - int32_t values_offset = 0; - std::vector null_counts(list_depth, 0); - for (int64_t i = 0; i < total_levels_read; i++) { - int16_t rep_level = rep_levels[i]; - if (rep_level < descr_->max_repetition_level()) { - for (int64_t j = rep_level; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + int32_t values_offset = 0; + std::vector null_counts(list_depth, 0); + for (int64_t i = 0; i < total_levels_read; i++) { + int16_t rep_level = rep_levels[i]; + if (rep_level < descr_->max_repetition_level()) { + for (int64_t j = rep_level; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); + } - if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { - RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); - null_counts[j]++; + if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { + RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); + null_counts[j]++; + break; + } else { + RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); + if (empty_def_level[j] == def_levels[i]) { break; - } else { - RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); - if (empty_def_level[j] == def_levels[i]) { - break; - } } } } - if (def_levels[i] >= values_def_level) { - values_offset++; - } } - // Add the final offset to all lists - for (int64_t j = 0; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + if (def_levels[i] >= values_def_level) { + values_offset++; } - - std::vector> offsets; - std::vector> valid_bits; - std::vector list_lengths; - for (int64_t j = 0; j < list_depth; j++) { - list_lengths.push_back(offset_builders[j]->length() - 1); - std::shared_ptr array; - RETURN_NOT_OK(offset_builders[j]->Finish(&array)); - offsets.emplace_back(std::static_pointer_cast(array)->values()); - RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); - valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + // Add the final offset to all lists + for (int64_t j = 0; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); } + } - std::shared_ptr output(*array); - for (int64_t j = list_depth - 1; j >= 0; j--) { - auto list_type = - ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); - output = std::make_shared<::arrow::ListArray>( - list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]); - } - *array = output; + std::vector> offsets; + std::vector> valid_bits; + std::vector list_lengths; + for (int64_t j = 0; j < list_depth; j++) { + list_lengths.push_back(offset_builders[j]->length() - 1); + std::shared_ptr array; + RETURN_NOT_OK(offset_builders[j]->Finish(&array)); + offsets.emplace_back(std::static_pointer_cast(array)->values()); + 
RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); + valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + + std::shared_ptr output = flat_array; + for (int64_t j = list_depth - 1; j >= 0; j--) { + auto list_type = + ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); + output = std::make_shared<::arrow::ListArray>(list_type, list_lengths[j], offsets[j], + output, valid_bits[j], null_counts[j]); } + *inout_array = output; return Status::OK(); } @@ -909,8 +951,7 @@ struct TransferFunctor { using ParquetCType = typename ParquetType::c_type; Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { static_assert(!std::is_same::value, "The fast path transfer functor should be used " "for primitive values"); @@ -938,8 +979,7 @@ template struct TransferFunctor> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr values = reader->ReleaseValues(); @@ -957,8 +997,7 @@ struct TransferFunctor struct TransferFunctor<::arrow::BooleanType, BooleanType> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr data; @@ -991,8 +1030,7 @@ struct TransferFunctor<::arrow::BooleanType, BooleanType> { template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1001,7 +1039,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { auto data_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *data_ptr++ = impala_timestamp_to_nanoseconds(values[i]); + *data_ptr++ = Int96GetNanoSeconds(values[i]); } if (reader->nullable_values()) { @@ -1019,8 +1057,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { template <> struct TransferFunctor<::arrow::Date64Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1029,7 +1066,7 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { auto out_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *out_ptr++ = static_cast(values[i]) * kMillisecondsInADay; + *out_ptr++ = static_cast(values[i]) * kMillisecondsPerDay; } if (reader->nullable_values()) { @@ -1046,26 +1083,29 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { template struct TransferFunctor< ArrowType, ParquetType, - typename std::enable_if::value || - std::is_same::value>::type> { + typename std::enable_if< + (std::is_base_of<::arrow::BinaryType, ArrowType>::value || + std::is_same<::arrow::FixedSizeBinaryType, ArrowType>::value) && + (std::is_same::value || + 
std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - RETURN_NOT_OK(reader->builder()->Finish(out)); + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + std::vector> chunks = reader->GetBuilderChunks(); if (type->id() == ::arrow::Type::STRING) { // Convert from BINARY type to STRING - auto new_data = (*out)->data()->Copy(); - new_data->type = type; - *out = ::arrow::MakeArray(new_data); + for (size_t i = 0; i < chunks.size(); ++i) { + auto new_data = chunks[i]->data()->Copy(); + new_data->type = type; + chunks[i] = ::arrow::MakeArray(new_data); + } } + *out = std::make_shared(chunks); return Status::OK(); } }; static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop) { - using ::arrow::BitUtil::FromBigEndian; - const int32_t length = stop - start; DCHECK_GE(length, 0); @@ -1121,37 +1161,54 @@ static constexpr int32_t kMaxDecimalBytes = 16; /// \brief Convert a sequence of big-endian bytes to one int64_t (high bits) and one /// uint64_t (low bits). -static void BytesToIntegerPair(const uint8_t* bytes, - const int32_t total_number_of_bytes_used, int64_t* high, - uint64_t* low) { - DCHECK_GE(total_number_of_bytes_used, kMinDecimalBytes); - DCHECK_LE(total_number_of_bytes_used, kMaxDecimalBytes); - - /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the - /// sign bit. - const bool is_negative = static_cast(bytes[0]) < 0; +static void BytesToIntegerPair(const uint8_t* bytes, const int32_t length, + int64_t* out_high, uint64_t* out_low) { + DCHECK_GE(length, kMinDecimalBytes); + DCHECK_LE(length, kMaxDecimalBytes); - /// Sign extend the low bits if necessary - *low = UINT64_MAX * (is_negative && total_number_of_bytes_used < 8); - *high = -1 * (is_negative && total_number_of_bytes_used < kMaxDecimalBytes); + // XXX This code is copied from Decimal::FromBigEndian - /// Stop byte of the high bytes - const int32_t high_bits_offset = std::max(0, total_number_of_bytes_used - 8); + int64_t high, low; - /// Shift left enough bits to make room for the incoming int64_t - *high <<= high_bits_offset * CHAR_BIT; + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. + const bool is_negative = static_cast(bytes[0]) < 0; - /// Preserve the upper bits by inplace OR-ing the int64_t - *high |= BytesToInteger(bytes, 0, high_bits_offset); + // 1. Extract the high bytes + // Stop byte of the high bytes + const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = BytesToInteger(bytes, 0, high_bits_offset); - /// Stop byte of the low bytes - const int32_t low_bits_offset = std::min(total_number_of_bytes_used, 8); + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } + + // 2. 
Extract the low bytes + // Stop byte of the low bytes + const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = BytesToInteger(bytes, high_bits_offset, length); - /// Shift left enough bits to make room for the incoming uint64_t - *low <<= low_bits_offset * CHAR_BIT; + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } - /// Preserve the upper bits by inplace OR-ing the uint64_t - *low |= BytesToInteger(bytes, high_bits_offset, total_number_of_bytes_used); + *out_high = high; + *out_low = static_cast(low); } static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width, @@ -1166,121 +1223,133 @@ static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_wid BytesToIntegerPair(value, byte_width, high, low); } -/// \brief Convert an array of FixedLenByteArrays to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating a arrow::FixedSizeBinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in the FixedSizeBinaryArray to two integers -/// representing the high and low bits of each decimal value. +// ---------------------------------------------------------------------- +// BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 + +template +Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>&, + MemoryPool* pool, std::shared_ptr*) { + return Status::NotImplemented("not implemented"); +} + template <> -struct TransferFunctor<::arrow::Decimal128Type, FLBAType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& fixed_size_binary_array = + static_cast(array); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& fixed_size_binary_array = - static_cast(*array); + // The byte width of each decimal value + const int32_t type_length = + static_cast(*type).byte_width(); - // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time - // this will be different from the decimal array width because we write the minimum - // number of bytes necessary to represent a given precision - const int32_t byte_width = - static_cast(*fixed_size_binary_array.type()) - .byte_width(); + // number of elements in the entire array + const int64_t length = fixed_size_binary_array.length(); - // The byte width of each decimal value - const int32_t type_length = - static_cast(*type).byte_width(); + // Get the byte width of the values in the FixedSizeBinaryArray. 
Most of the time + // this will be different from the decimal array width because we write the minimum + // number of bytes necessary to represent a given precision + const int32_t byte_width = + static_cast(*fixed_size_binary_array.type()) + .byte_width(); - // number of elements in the entire array - const int64_t length = fixed_size_binary_array.length(); + // allocate memory for the decimal array + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // allocate memory for the decimal array - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); - - // convert each FixedSizeBinary value to valid decimal bytes - const int64_t null_count = fixed_size_binary_array.null_count(); - if (null_count > 0) { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { - if (!fixed_size_binary_array.IsNull(i)) { - RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, - out_ptr); - } - } - } else { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); + + // convert each FixedSizeBinary value to valid decimal bytes + const int64_t null_count = fixed_size_binary_array.null_count(); + if (null_count > 0) { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + if (!fixed_size_binary_array.IsNull(i)) { RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); } } - - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - return Status::OK(); + } else { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); + } } -}; -/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating an arrow::BinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers -/// representing the high and low bits of each decimal value. 
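Illustrative sketch (not part of the diff): the same big-endian decode as BytesToIntegerPair above, re-derived with only 8-bit shifts so that no shift count can reach the undefined 64-bit case that SafeLeftShift guards against. The function name is hypothetical.

#include <cstdint>

// Decode `length` (1..16) big-endian two's-complement bytes into
// sign-extended high/low 64-bit words.
static void BigEndianToInt128(const uint8_t* bytes, int32_t length,
                              int64_t* out_high, uint64_t* out_low) {
  const bool is_negative = (bytes[0] & 0x80) != 0;
  uint64_t high = is_negative ? ~0ULL : 0ULL;  // pre-filled sign extension
  uint64_t low = is_negative ? ~0ULL : 0ULL;
  for (int32_t i = 0; i < length; ++i) {
    // Shift the 128-bit accumulator left by one byte, then insert the next.
    high = (high << 8) | (low >> 56);
    low = (low << 8) | bytes[i];
  }
  *out_high = static_cast<int64_t>(high);
  *out_low = low;
}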
-template <> -struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& binary_array = static_cast(*array); + return Status::OK(); +} - const int64_t length = binary_array.length(); +template <> +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& binary_array = static_cast(array); + const int64_t length = binary_array.length(); - const auto& decimal_type = static_cast(*type); - const int64_t type_length = decimal_type.byte_width(); + const auto& decimal_type = static_cast(*type); + const int64_t type_length = decimal_type.byte_width(); - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); - const int64_t null_count = binary_array.null_count(); + const int64_t null_count = binary_array.null_count(); - // convert each BinaryArray value to valid decimal bytes - for (int64_t i = 0; i < length; i++, out_ptr += type_length) { - int32_t record_len = 0; - const uint8_t* record_loc = binary_array.GetValue(i, &record_len); + // convert each BinaryArray value to valid decimal bytes + for (int64_t i = 0; i < length; i++, out_ptr += type_length) { + int32_t record_len = 0; + const uint8_t* record_loc = binary_array.GetValue(i, &record_len); - if ((record_len < 0) || (record_len > type_length)) { - return Status::Invalid("Invalid BYTE_ARRAY size"); - } + if ((record_len < 0) || (record_len > type_length)) { + return Status::Invalid("Invalid BYTE_ARRAY size"); + } - auto out_ptr_view = reinterpret_cast(out_ptr); - out_ptr_view[0] = 0; - out_ptr_view[1] = 0; + auto out_ptr_view = reinterpret_cast(out_ptr); + out_ptr_view[0] = 0; + out_ptr_view[1] = 0; - // only convert rows that are not null if there are nulls, or - // all rows, if there are not - if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { - RawBytesToDecimalBytes(record_loc, record_len, out_ptr); - } + // only convert rows that are not null if there are nulls, or + // all rows, if there are not + if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { + RawBytesToDecimalBytes(record_loc, record_len, out_ptr); } + } + + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, binary_array.null_bitmap(), null_count); + return Status::OK(); +} + +/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array +/// We do this by: +/// 1. Creating an arrow::BinaryArray from the RecordReader's builder +/// 2. Allocating a buffer for the arrow::Decimal128Array +/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers +/// representing the high and low bits of each decimal value. 
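The comments above note that the stored FIXED_LEN_BYTE_ARRAY width is usually narrower than the 16-byte Decimal128 slot because the writer emits the minimum width for the declared precision. A back-of-the-envelope version of that width, shown as an illustrative sketch (the library ships an exact lookup table in DecimalSize):

#include <cmath>
#include <cstdint>

// Smallest big-endian byte width holding a signed decimal of the given
// precision, e.g. precision 9 -> 4 bytes, 18 -> 8 bytes, 38 -> 16 bytes.
int32_t MinDecimalBytes(int32_t precision) {
  // Bits for the magnitude of 10^precision - 1, plus one sign bit.
  const double bits = std::ceil(precision * std::log2(10.0)) + 1.0;
  return static_cast<int32_t>(std::ceil(bits / 8.0));
}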
+template +struct TransferFunctor< + ArrowType, ParquetType, + typename std::enable_if::value && + (std::is_same::value || + std::is_same::value)>::type> { + Status operator()(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, binary_array.null_bitmap(), null_count); + ::arrow::ArrayVector chunks = reader->GetBuilderChunks(); + for (size_t i = 0; i < chunks.size(); ++i) { + std::shared_ptr chunk_as_decimal; + RETURN_NOT_OK( + ConvertToDecimal128(*chunks[i], type, pool, &chunk_as_decimal)); + + // Replace the chunk, which will hopefully also free memory as we go + chunks[i] = chunk_as_decimal; + } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1295,7 +1364,7 @@ template ::value>::type> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); const int64_t length = reader->values_written(); @@ -1342,8 +1411,7 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, template <> struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; @@ -1351,23 +1419,23 @@ struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { template <> struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; -#define TRANSFER_DATA(ArrowType, ParquetType) \ - TransferFunctor func; \ - RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), out)); \ - RETURN_NOT_OK(WrapIntoListArray(out)) +#define TRANSFER_DATA(ArrowType, ParquetType) \ + TransferFunctor func; \ + RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ + RETURN_NOT_OK(WrapIntoListArray(&result)) #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ TRANSFER_DATA(ArrowType, ParquetType); \ } break; -Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status PrimitiveImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { try { // Pre-allocation gives much better performance for flat columns record_reader_->Reserve(records_to_read); @@ -1387,6 +1455,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* return ::arrow::Status::IOError(e.what()); } + Datum result; switch (field_->type()->id()) { TRANSFER_CASE(BOOL, ::arrow::BooleanType, BooleanType) TRANSFER_CASE(UINT8, ::arrow::UInt8Type, Int32Type) @@ -1405,8 +1474,8 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type) TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType) case ::arrow::Type::NA: { - *out = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); - RETURN_NOT_OK(WrapIntoListArray(out)); + result = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); + 
RETURN_NOT_OK(WrapIntoListArray(&result)); break; } case ::arrow::Type::DECIMAL: { @@ -1447,11 +1516,19 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* TRANSFER_CASE(TIME32, ::arrow::Time32Type, Int32Type) TRANSFER_CASE(TIME64, ::arrow::Time64Type, Int64Type) default: - std::stringstream ss; - ss << "No support for reading columns of type " << field_->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("No support for reading columns of type ", + field_->type()->ToString()); } + DCHECK_NE(result.kind(), Datum::NONE); + + if (result.kind() == Datum::ARRAY) { + *out = std::make_shared(result.make_array()); + } else if (result.kind() == Datum::CHUNKED_ARRAY) { + *out = result.chunked_array(); + } else { + DCHECK(false) << "Should be impossible"; + } return Status::OK(); } @@ -1477,10 +1554,17 @@ ColumnReader::ColumnReader(std::unique_ptr impl) ColumnReader::~ColumnReader() {} -Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status ColumnReader::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { return impl_->NextBatch(records_to_read, out); } +Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(impl_->NextBatch(records_to_read, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + // StructImpl methods Status StructImpl::DefLevelsToNullArray(std::shared_ptr* null_bitmap_out, @@ -1565,17 +1649,21 @@ Status StructImpl::GetRepLevels(const int16_t** data, size_t* length) { return Status::NotImplemented("GetRepLevels is not implemented for struct"); } -Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status StructImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { std::vector> children_arrays; std::shared_ptr null_bitmap; int64_t null_count; // Gather children arrays and def levels for (auto& child : children_) { - std::shared_ptr child_array; + std::shared_ptr field; + RETURN_NOT_OK(child->NextBatch(records_to_read, &field)); - RETURN_NOT_OK(child->NextBatch(records_to_read, &child_array)); - children_arrays.push_back(child_array); + if (field->num_chunks() > 1) { + return Status::Invalid("Chunked field reads not yet supported with StructArray"); + } + children_arrays.push_back(field->chunk(0)); } RETURN_NOT_OK(DefLevelsToNullArray(&null_bitmap, &null_count)); @@ -1589,8 +1677,9 @@ Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* ou } } - *out = std::make_shared(field()->type(), struct_length, children_arrays, - null_bitmap, null_count); + auto result = std::make_shared(field()->type(), struct_length, + children_arrays, null_bitmap, null_count); + *out = std::make_shared(result); return Status::OK(); } @@ -1613,10 +1702,16 @@ RowGroupReader::~RowGroupReader() {} RowGroupReader::RowGroupReader(FileReader::Impl* impl, int row_group_index) : impl_(impl), row_group_index_(row_group_index) {} -Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::ChunkedArray>* out) { return impl_->ReadColumnChunk(column_index_, row_group_index_, out); } +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(impl_->ReadColumnChunk(column_index_, row_group_index_, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + ColumnChunkReader::~ColumnChunkReader() {} 
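Illustrative sketch (not part of the diff): the calls below match the ChunkedArray signatures introduced in this patch, while the reader setup is left abstract. The old Array-returning overload still compiles but is deprecated since 0.12, and via GetSingleChunk it now returns Status::Invalid whenever the column materializes as more than one chunk.

#include <memory>

#include "arrow/array.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

// Read column 0 through the new chunked API and walk its chunks.
::arrow::Status ReadFirstColumn(parquet::arrow::FileReader* reader) {
  std::shared_ptr<::arrow::ChunkedArray> column;
  RETURN_NOT_OK(reader->ReadColumn(0, &column));
  for (int i = 0; i < column->num_chunks(); ++i) {
    std::shared_ptr<::arrow::Array> chunk = column->chunk(i);
    // ... process each chunk independently ...
  }
  return ::arrow::Status::OK();
}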
ColumnChunkReader::ColumnChunkReader(FileReader::Impl* impl, int row_group_index, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cd94ca28fdcb..5286e742b08c1 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -30,6 +30,7 @@ namespace arrow { class Array; +class ChunkedArray; class MemoryPool; class RecordBatchReader; class Schema; @@ -125,6 +126,10 @@ class PARQUET_EXPORT FileReader { std::shared_ptr<::arrow::Schema>* out); // Read column as a whole into an Array. + ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); // NOTE: Experimental API @@ -139,27 +144,11 @@ class PARQUET_EXPORT FileReader { // 2 foo3 // // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - // NOTE: Experimental API - // Reads a specific top level schema field into an Array, while keeping only chosen - // leaf columns. - // The index i refers the index of the top level schema field, which may - // be nested or flat, and indices vector refers to the leaf column indices - e.g. - // - // i indices - // 0 0 foo.bar - // 0 1 foo.bar.baz - // 0 2 foo.qux - // 1 3 foo2 - // 2 4 foo3 - // - // i=0 indices={0,2} will read a partial struct with foo.bar and foo.quox columns - // i=1 indices={3} will read foo2 column - // i=1 indices={2} will result in out=nullptr - // leaf indices which are unrelated to the schema field are ignored - ::arrow::Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr<::arrow::Array>* out); + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the /// ordering in row_group_indices matters. @@ -248,6 +237,10 @@ class PARQUET_EXPORT RowGroupReader { class PARQUET_EXPORT ColumnChunkReader { public: + ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); virtual ~ColumnChunkReader(); @@ -281,6 +274,11 @@ class PARQUET_EXPORT ColumnReader { // // Returns Status::OK on a successful read, including if you have exhausted // the data available in the file. + ::arrow::Status NextBatch(int64_t batch_size, + std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); private: diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index 4a3cd526b118a..39945afc78298 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -1,4 +1,4 @@ -// licensed to the Apache Software Foundation (ASF) under one +// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. 
The ASF licenses this file @@ -22,25 +22,20 @@ #include #include #include -#include #include #include +#include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" -#include "arrow/memory_pool.h" -#include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/logging.h" -#include "arrow/util/rle-encoding.h" #include "parquet/column_page.h" #include "parquet/column_reader.h" -#include "parquet/encoding-internal.h" #include "parquet/encoding.h" #include "parquet/exception.h" -#include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/types.h" @@ -51,9 +46,6 @@ namespace internal { namespace BitUtil = ::arrow::BitUtil; -template -class TypedRecordReader; - // PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index // encoding. static bool IsDictionaryIndexEncoding(const Encoding::type& e) { @@ -80,20 +72,15 @@ class RecordReader::RecordReaderImpl { null_count_(0), levels_written_(0), levels_position_(0), - levels_capacity_(0) { + levels_capacity_(0), + uses_values_(!(descr->physical_type() == Type::BYTE_ARRAY)) { nullable_values_ = internal::HasSpacedValues(descr); - values_ = AllocateBuffer(pool); + if (uses_values_) { + values_ = AllocateBuffer(pool); + } valid_bits_ = AllocateBuffer(pool); def_levels_ = AllocateBuffer(pool); rep_levels_ = AllocateBuffer(pool); - - if (descr->physical_type() == Type::BYTE_ARRAY) { - builder_.reset(new ::arrow::BinaryBuilder(pool)); - } else if (descr->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { - int byte_width = descr->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool)); - } Reset(); } @@ -218,9 +205,13 @@ class RecordReader::RecordReaderImpl { bool nullable_values() const { return nullable_values_; } std::shared_ptr ReleaseValues() { - auto result = values_; - values_ = AllocateBuffer(pool_); - return result; + if (uses_values_) { + auto result = values_; + values_ = AllocateBuffer(pool_); + return result; + } else { + return nullptr; + } } std::shared_ptr ReleaseIsValid() { @@ -229,8 +220,6 @@ class RecordReader::RecordReaderImpl { return result; } - ::arrow::ArrayBuilder* builder() { return builder_.get(); } - // Process written repetition/definition levels to reach the end of // records. Process no more levels than necessary to delimit the indicated // number of logical records. 
Updates internal state of RecordReader @@ -334,7 +323,13 @@ class RecordReader::RecordReaderImpl { } int type_size = GetTypeByteSize(descr_->physical_type()); - PARQUET_THROW_NOT_OK(values_->Resize(new_values_capacity * type_size, false)); + + // XXX(wesm): A hack to avoid memory allocation when reading directly + // into builder classes + if (uses_values_) { + PARQUET_THROW_NOT_OK(values_->Resize(new_values_capacity * type_size, false)); + } + values_capacity_ = new_values_capacity; } if (nullable_values_) { @@ -375,13 +370,15 @@ class RecordReader::RecordReaderImpl { records_read_ = 0; - // Calling Finish on the builders also resets them + // Call Finish on the binary builders to reset them } void ResetValues() { if (values_written_ > 0) { // Resize to 0, but do not shrink to fit - PARQUET_THROW_NOT_OK(values_->Resize(0, false)); + if (uses_values_) { + PARQUET_THROW_NOT_OK(values_->Resize(0, false)); + } PARQUET_THROW_NOT_OK(valid_bits_->Resize(0, false)); values_written_ = 0; values_capacity_ = 0; @@ -391,6 +388,8 @@ class RecordReader::RecordReaderImpl { virtual void DebugPrintState() = 0; + virtual std::vector> GetBuilderChunks() = 0; + protected: virtual bool ReadNewPage() = 0; @@ -434,10 +433,10 @@ class RecordReader::RecordReaderImpl { int64_t levels_position_; int64_t levels_capacity_; - // TODO(wesm): ByteArray / FixedLenByteArray types - std::unique_ptr<::arrow::ArrayBuilder> builder_; - std::shared_ptr<::arrow::ResizableBuffer> values_; + // In the case of false, don't allocate the values buffer (when we directly read into + // builder classes). + bool uses_values_; template T* ValuesHead() { @@ -449,13 +448,32 @@ class RecordReader::RecordReaderImpl { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; +template +struct RecordReaderTraits { + using BuilderType = ::arrow::ArrayBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::internal::ChunkedBinaryBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::FixedSizeBinaryBuilder; +}; + template class TypedRecordReader : public RecordReader::RecordReaderImpl { public: - typedef typename DType::c_type T; + using T = typename DType::c_type; + + using BuilderType = typename RecordReaderTraits::BuilderType; - TypedRecordReader(const ColumnDescriptor* schema, ::arrow::MemoryPool* pool) - : RecordReader::RecordReaderImpl(schema, pool), current_decoder_(nullptr) {} + TypedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) { + InitializeBuilder(); + } void ResetDecoders() override { decoders_.clear(); } @@ -546,19 +564,27 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { std::cout << std::endl; } + std::vector> GetBuilderChunks() override { + throw ParquetException("GetChunks only implemented for binary types"); + } + private: - typedef Decoder DecoderType; + using DecoderType = typename EncodingTraits::Decoder; // Map of encoding type to the respective decoder object. For example, a // column chunk's data pages may include both dictionary-encoded and // plain-encoded data. 
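Illustrative sketch (not part of the diff): the BYTE_ARRAY specialization above now accumulates output in a ChunkedBinaryBuilder (capped at 16MB chunks below) because a BinaryArray addresses its value bytes with 32-bit offsets, so one oversized chunk would overflow them. The toy class here, with hypothetical names, shows only the chunking idea and makes no claim to match the internal implementation.

#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/status.h"

// When appending would push the current chunk past the limit, finish it and
// start a new one, so no single BinaryArray outgrows its 32-bit offsets.
class TinyChunkedBinaryBuilder {
 public:
  explicit TinyChunkedBinaryBuilder(int64_t max_chunk_bytes)
      : max_chunk_bytes_(max_chunk_bytes), chunk_bytes_(0) {}

  ::arrow::Status Append(const uint8_t* data, int32_t length) {
    if (chunk_bytes_ + length > max_chunk_bytes_) {
      RETURN_NOT_OK(FinishChunk());
    }
    chunk_bytes_ += length;
    return builder_.Append(data, length);
  }

  ::arrow::Status Finish(std::vector<std::shared_ptr<::arrow::Array>>* out) {
    RETURN_NOT_OK(FinishChunk());
    *out = std::move(chunks_);
    return ::arrow::Status::OK();
  }

 private:
  ::arrow::Status FinishChunk() {
    std::shared_ptr<::arrow::Array> chunk;
    RETURN_NOT_OK(builder_.Finish(&chunk));  // Finish also resets the builder
    chunks_.push_back(std::move(chunk));
    chunk_bytes_ = 0;
    return ::arrow::Status::OK();
  }

  int64_t max_chunk_bytes_;
  int64_t chunk_bytes_;
  ::arrow::BinaryBuilder builder_;
  std::vector<std::shared_ptr<::arrow::Array>> chunks_;
};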
- std::unordered_map> decoders_; + std::unordered_map> decoders_; + + std::unique_ptr builder_; DecoderType* current_decoder_; // Advance to the next data page bool ReadNewPage() override; + void InitializeBuilder() {} + void ConfigureDictionary(const DictionaryPage* page); }; @@ -572,18 +598,41 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +void TypedRecordReader::InitializeBuilder() { + // Maximum of 16MB chunks + constexpr int32_t kBinaryChunksize = 1 << 24; + DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); + builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool_)); +} + +template <> +void TypedRecordReader::InitializeBuilder() { + DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); + int byte_width = descr_->type_length(); + std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); + builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool_)); +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { + ::arrow::ArrayVector chunks; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); + return chunks; +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); + return ::arrow::ArrayVector({chunk}); +} + template <> inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { - auto values = ValuesHead(); - int64_t num_decoded = - current_decoder_->Decode(values, static_cast(values_to_read)); + int64_t num_decoded = current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), builder_.get()); DCHECK_EQ(num_decoded, values_to_read); - - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); - for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); - } ResetValues(); } @@ -594,9 +643,8 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } ResetValues(); } @@ -604,25 +652,10 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) template <> inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to_read, int64_t null_count) { - uint8_t* valid_bits = valid_bits_->mutable_data(); - const int64_t valid_bits_offset = values_written_; - auto values = ValuesHead(); - - int64_t num_decoded = current_decoder_->DecodeSpaced( - values, static_cast(values_to_read), static_cast(null_count), valid_bits, - valid_bits_offset); + int64_t num_decoded = current_decoder_->DecodeArrow( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, builder_.get()); DCHECK_EQ(num_decoded, values_to_read); - - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); - - for (int64_t i = 0; i < num_decoded; i++) { - if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); - } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); - } - } ResetValues(); } @@ -638,12 +671,11 @@ inline void 
TypedRecordReader::ReadValuesSpaced(int64_t values_to_read valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + PARQUET_THROW_NOT_OK(builder_->AppendNull()); } } ResetValues(); @@ -664,8 +696,8 @@ inline void TypedRecordReader::ConfigureDictionary(const DictionaryPage* if (page->encoding() == Encoding::PLAIN_DICTIONARY || page->encoding() == Encoding::PLAIN) { - PlainDecoder dictionary(descr_); - dictionary.SetData(page->num_values(), page->data(), page->size()); + auto dictionary = MakeTypedDecoder(Encoding::PLAIN, descr_); + dictionary->SetData(page->num_values(), page->data(), page->size()); // The dictionary is fully decoded during DictionaryDecoder::Init, so the // DictionaryPage buffer is no longer required after this step @@ -673,14 +705,16 @@ inline void TypedRecordReader::ConfigureDictionary(const DictionaryPage* // TODO(wesm): investigate whether this all-or-nothing decoding of the // dictionary makes sense and whether performance can be improved - auto decoder = std::make_shared>(descr_, pool_); - decoder->SetDict(&dictionary); - decoders_[encoding] = decoder; + std::unique_ptr> decoder = MakeDictDecoder(descr_, pool_); + decoder->SetDict(dictionary.get()); + decoders_[encoding] = + std::unique_ptr(dynamic_cast(decoder.release())); } else { ParquetException::NYI("only plain dictionary encoding has been implemented"); } current_decoder_ = decoders_[encoding].get(); + DCHECK(current_decoder_); } template @@ -746,6 +780,7 @@ bool TypedRecordReader::ReadNewPage() { auto it = decoders_.find(static_cast(encoding)); if (it != decoders_.end()) { + DCHECK(it->second.get() != nullptr); if (encoding == Encoding::RLE_DICTIONARY) { DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY); } @@ -753,9 +788,9 @@ bool TypedRecordReader::ReadNewPage() { } else { switch (encoding) { case Encoding::PLAIN: { - std::shared_ptr decoder(new PlainDecoder(descr_)); - decoders_[static_cast(encoding)] = decoder; + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_); current_decoder_ = decoder.get(); + decoders_[static_cast(encoding)] = std::move(decoder); break; } case Encoding::RLE_DICTIONARY: @@ -809,8 +844,12 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, case Type::FIXED_LEN_BYTE_ARRAY: return std::shared_ptr( new RecordReader(new TypedRecordReader(descr, pool))); - default: - DCHECK(false); + default: { + // PARQUET-1481: This can occur if the file is corrupt + std::stringstream ss; + ss << "Invalid physical column type: " << static_cast(descr->physical_type()); + throw ParquetException(ss.str()); + } } // Unreachable code, but supress compiler warning return nullptr; @@ -845,8 +884,6 @@ std::shared_ptr RecordReader::ReleaseIsValid() { return impl_->ReleaseIsValid(); } -::arrow::ArrayBuilder* RecordReader::builder() { return impl_->builder(); } - int64_t RecordReader::values_written() const { return impl_->values_written(); } int64_t RecordReader::levels_position() const { return impl_->levels_position(); } @@ -863,6 +900,10 @@ void RecordReader::SetPageReader(std::unique_ptr reader) { impl_->SetPageReader(std::move(reader)); } +::arrow::ArrayVector RecordReader::GetBuilderChunks() { + return 
impl_->GetBuilderChunks(); +} + void RecordReader::DebugPrintState() { impl_->DebugPrintState(); } } // namespace internal diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h index 7efd0d54899fe..cc932c2865028 100644 --- a/cpp/src/parquet/arrow/record_reader.h +++ b/cpp/src/parquet/arrow/record_reader.h @@ -20,15 +20,15 @@ #include #include +#include #include "arrow/memory_pool.h" -#include "parquet/util/macros.h" #include "parquet/util/memory.h" namespace arrow { -class ArrayBuilder; +class Array; } // namespace arrow @@ -77,7 +77,6 @@ class RecordReader { std::shared_ptr ReleaseValues(); std::shared_ptr ReleaseIsValid(); - ::arrow::ArrayBuilder* builder(); /// \brief Number of values written including nulls (if any) int64_t values_written() const; @@ -106,6 +105,9 @@ class RecordReader { void DebugPrintState(); + // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output + std::vector> GetBuilderChunks(); + private: std::unique_ptr impl_; explicit RecordReader(RecordReaderImpl* impl); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index d0014a6f3aa2a..f1ebad0e5667f 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -19,14 +19,20 @@ #include #include +#include #include -#include "parquet/api/schema.h" -#include "parquet/util/schema-util.h" - -#include "arrow/api.h" +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/type.h" #include "arrow/util/logging.h" +#include "parquet/arrow/writer.h" +#include "parquet/exception.h" +#include "parquet/properties.h" +#include "parquet/types.h" +#include "parquet/util/schema-util.h" + using arrow::Field; using arrow::Status; @@ -80,10 +86,9 @@ static Status FromFLBA(const PrimitiveNode& node, std::shared_ptr* ou *out = MakeDecimal128Type(node); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for fixed-length binary array"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for fixed-length binary array"); } return Status::OK(); @@ -122,10 +127,9 @@ static Status FromInt32(const PrimitiveNode& node, std::shared_ptr* o *out = MakeDecimal128Type(node); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for INT32"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for INT32"); } return Status::OK(); } @@ -154,10 +158,9 @@ static Status FromInt64(const PrimitiveNode& node, std::shared_ptr* o *out = ::arrow::time64(::arrow::TimeUnit::MICRO); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for INT64"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for INT64"); } return Status::OK(); } @@ -423,45 +426,66 @@ Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, return Status::OK(); } +static LogicalType::type LogicalTypeFromArrowTimeUnit(::arrow::TimeUnit::type time_unit) { + switch (time_unit) { + case ::arrow::TimeUnit::MILLI: + return LogicalType::TIMESTAMP_MILLIS; + case ::arrow::TimeUnit::MICRO: + return LogicalType::TIMESTAMP_MICROS; + case ::arrow::TimeUnit::SECOND: + case 
::arrow::TimeUnit::NANO: + // No equivalent parquet logical type. + break; + } + + return LogicalType::NONE; +} + static Status GetTimestampMetadata(const ::arrow::TimestampType& type, const ArrowWriterProperties& properties, ParquetType::type* physical_type, LogicalType::type* logical_type) { - auto unit = type.unit(); - *physical_type = ParquetType::INT64; + const bool coerce = properties.coerce_timestamps_enabled(); + const auto unit = coerce ? properties.coerce_timestamps_unit() : type.unit(); - if (properties.coerce_timestamps_enabled()) { - auto coerce_unit = properties.coerce_timestamps_unit(); - if (coerce_unit == ::arrow::TimeUnit::MILLI) { - *logical_type = LogicalType::TIMESTAMP_MILLIS; - } else if (coerce_unit == ::arrow::TimeUnit::MICRO) { - *logical_type = LogicalType::TIMESTAMP_MICROS; - } else { - return Status::NotImplemented( - "Can only coerce Arrow timestamps to milliseconds" - " or microseconds"); + // The user is explicitly asking for Impala int96 encoding, there is no + // logical type. + if (properties.support_deprecated_int96_timestamps()) { + *physical_type = ParquetType::INT96; + return Status::OK(); + } + + *physical_type = ParquetType::INT64; + *logical_type = LogicalTypeFromArrowTimeUnit(unit); + + // The user is requesting that all timestamp columns are casted to a specific + // type. Only 2 TimeUnit are supported by arrow-parquet. + if (coerce) { + switch (unit) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + break; + case ::arrow::TimeUnit::NANO: + case ::arrow::TimeUnit::SECOND: + return Status::NotImplemented( + "Can only coerce Arrow timestamps to milliseconds" + " or microseconds"); } + return Status::OK(); } - if (unit == ::arrow::TimeUnit::MILLI) { - *logical_type = LogicalType::TIMESTAMP_MILLIS; - } else if (unit == ::arrow::TimeUnit::MICRO) { + // Until ARROW-3729 is resolved, nanoseconds are explicitly converted to + // int64 microseconds when deprecated int96 is not requested. 
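Illustrative summary (not part of the diff) of the three cases this function now distinguishes, followed by a small properties sketch; the builder method spellings are taken from parquet/arrow/writer.h and should be treated as an assumption here.

#include <memory>

#include "arrow/type.h"
#include "parquet/arrow/writer.h"

// 1. Deprecated INT96 requested: physical INT96, no logical type, any unit.
// 2. coerce_timestamps(MILLI or MICRO): INT64 plus TIMESTAMP_MILLIS/_MICROS;
//    coercing to SECOND or NANO yields Status::NotImplemented.
// 3. No coercion: MILLI and MICRO map directly; NANO is stored as INT64
//    microseconds (until ARROW-3729); SECOND is NotImplemented.
std::shared_ptr<parquet::ArrowWriterProperties> MicroCoercionProps() {
  parquet::ArrowWriterProperties::Builder builder;
  builder.coerce_timestamps(::arrow::TimeUnit::MICRO);
  return builder.build();
}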
+ if (type.unit() == ::arrow::TimeUnit::NANO) *logical_type = LogicalType::TIMESTAMP_MICROS; - } else if (unit == ::arrow::TimeUnit::NANO) { - if (properties.support_deprecated_int96_timestamps()) { - *physical_type = ParquetType::INT96; - // No corresponding logical type - } else { - *logical_type = LogicalType::TIMESTAMP_MICROS; - } - } else { + else if (type.unit() == ::arrow::TimeUnit::SECOND) return Status::NotImplemented( "Only MILLI, MICRO, and NANOS units supported for Arrow timestamps with " "Parquet."); - } + return Status::OK(); -} +} // namespace arrow Status FieldToNode(const std::shared_ptr& field, const WriterProperties& properties, @@ -592,10 +616,9 @@ Status FieldToNode(const std::shared_ptr& field, } default: { // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR - std::stringstream ss; - ss << "Unhandled type for Arrow to Parquet schema conversion: "; - ss << field->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "Unhandled type for Arrow to Parquet schema conversion: ", + field->type()->ToString()); } } PARQUET_CATCH_NOT_OK(*out = @@ -698,7 +721,7 @@ int32_t DecimalSize(int32_t precision) { } DCHECK(false); return -1; -} +} // namespace arrow } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h index 649fe86120a18..0e65ed844eb58 100644 --- a/cpp/src/parquet/arrow/schema.h +++ b/cpp/src/parquet/arrow/schema.h @@ -22,15 +22,14 @@ #include #include -#include "arrow/api.h" - -#include "parquet/arrow/writer.h" #include "parquet/metadata.h" #include "parquet/schema.h" #include "parquet/util/visibility.h" namespace arrow { +class Field; +class Schema; class Status; } // namespace arrow diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index ef5de07d87f16..6813880f3b0e3 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -18,17 +18,29 @@ #include "parquet/arrow/writer.h" #include -#include +#include +#include #include #include -#include "arrow/api.h" +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" #include "arrow/compute/api.h" +#include "arrow/status.h" +#include "arrow/table.h" #include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" #include "arrow/visitor_inline.h" #include "arrow/util/logging.h" + #include "parquet/arrow/schema.h" +#include "parquet/column_writer.h" +#include "parquet/exception.h" +#include "parquet/file_writer.h" +#include "parquet/schema.h" +#include "parquet/util/memory.h" using arrow::Array; using arrow::BinaryArray; @@ -312,6 +324,10 @@ class ArrowColumnWriter { Status Write(const Array& data); Status Write(const ChunkedArray& data, int64_t offset, const int64_t size) { + if (data.length() == 0) { + return Status::OK(); + } + int64_t absolute_position = 0; int chunk_index = 0; int64_t chunk_offset = 0; @@ -386,7 +402,11 @@ class ArrowColumnWriter { Status WriteBatch(int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const typename ParquetType::c_type* values) { - auto typed_writer = static_cast*>(writer_); + auto typed_writer = + ::arrow::internal::checked_cast*>(writer_); + // WriteBatch was called with type mismatching the writer_'s type. This + // could be a schema conversion problem. 
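A brief note on the cast above (illustrative, with hypothetical types): checked_cast resolves to dynamic_cast in debug builds, so a writer/schema mismatch yields nullptr and trips the DCHECK that follows, while release builds compile it down to a plain static_cast with no runtime cost.

#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"

struct Writer { virtual ~Writer() = default; };  // hypothetical base
struct Int64Writer : Writer {};                  // hypothetical derived

void Demo(Writer* w) {
  // dynamic_cast in debug, static_cast in release; the DCHECK catches a
  // mismatched type only in debug builds, mirroring WriteBatch above.
  auto* typed = ::arrow::internal::checked_cast<Int64Writer*>(w);
  DCHECK(typed);
}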
@@ -386,7 +402,11 @@ class ArrowColumnWriter {
   Status WriteBatch(int64_t num_levels, const int16_t* def_levels,
                     const int16_t* rep_levels,
                     const typename ParquetType::c_type* values) {
-    auto typed_writer = static_cast<TypedColumnWriter<ParquetType>*>(writer_);
+    auto typed_writer =
+        ::arrow::internal::checked_cast<TypedColumnWriter<ParquetType>*>(writer_);
+    // WriteBatch was called with a type that does not match the writer_'s
+    // type. This could be a schema conversion problem.
+    DCHECK(typed_writer);
     PARQUET_CATCH_NOT_OK(
         typed_writer->WriteBatch(num_levels, def_levels, rep_levels, values));
     return Status::OK();
   }
 
@@ -397,7 +417,11 @@ class ArrowColumnWriter {
                          const int16_t* rep_levels, const uint8_t* valid_bits,
                          int64_t valid_bits_offset,
                          const typename ParquetType::c_type* values) {
-    auto typed_writer = static_cast<TypedColumnWriter<ParquetType>*>(writer_);
+    auto typed_writer =
+        ::arrow::internal::checked_cast<TypedColumnWriter<ParquetType>*>(writer_);
+    // WriteBatchSpaced was called with a type that does not match the
+    // writer_'s type. This could be a schema conversion problem.
+    DCHECK(typed_writer);
     PARQUET_CATCH_NOT_OK(typed_writer->WriteBatchSpaced(
         num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, values));
     return Status::OK();
@@ -570,20 +594,42 @@ NULLABLE_BATCH_FAST_PATH(DoubleType, ::arrow::DoubleType, double)
 NULLABLE_BATCH_FAST_PATH(Int64Type, ::arrow::TimestampType, int64_t)
 NONNULLABLE_BATCH_FAST_PATH(Int64Type, ::arrow::TimestampType, int64_t)
 
+#define CONV_CASE_LOOP(ConversionFunction) \
+  for (int64_t i = 0; i < num_values; i++) \
+    ConversionFunction(arrow_values[i], &output[i]);
+
+static void ConvertArrowTimestampToParquetInt96(const int64_t* arrow_values,
+                                                int64_t num_values,
+                                                ::arrow::TimeUnit::type unit_type,
+                                                Int96* output) {
+  switch (unit_type) {
+    case TimeUnit::NANO:
+      CONV_CASE_LOOP(internal::NanosecondsToImpalaTimestamp);
+      break;
+    case TimeUnit::MICRO:
+      CONV_CASE_LOOP(internal::MicrosecondsToImpalaTimestamp);
+      break;
+    case TimeUnit::MILLI:
+      CONV_CASE_LOOP(internal::MillisecondsToImpalaTimestamp);
+      break;
+    case TimeUnit::SECOND:
+      CONV_CASE_LOOP(internal::SecondsToImpalaTimestamp);
+      break;
+  }
+}
+
+#undef CONV_CASE_LOOP
+
 template <>
 Status ArrowColumnWriter::WriteNullableBatch<Int96Type, ::arrow::TimestampType>(
     const ::arrow::TimestampType& type, int64_t num_values, int64_t num_levels,
     const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits,
     int64_t valid_bits_offset, const int64_t* values) {
-  Int96* buffer;
+  Int96* buffer = nullptr;
   RETURN_NOT_OK(ctx_->GetScratchData<Int96>(num_values, &buffer));
-  if (type.unit() == TimeUnit::NANO) {
-    for (int i = 0; i < num_values; i++) {
-      internal::NanosecondsToImpalaTimestamp(values[i], &buffer[i]);
-    }
-  } else {
-    return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing");
-  }
+
+  ConvertArrowTimestampToParquetInt96(values, num_values, type.unit(), buffer);
+
   return WriteBatchSpaced(num_levels, def_levels, rep_levels, valid_bits,
                           valid_bits_offset, buffer);
 }
 
@@ -592,15 +638,11 @@ template <>
 Status ArrowColumnWriter::WriteNonNullableBatch<Int96Type, ::arrow::TimestampType>(
     const ::arrow::TimestampType& type, int64_t num_values, int64_t num_levels,
     const int16_t* def_levels, const int16_t* rep_levels, const int64_t* values) {
-  Int96* buffer;
+  Int96* buffer = nullptr;
   RETURN_NOT_OK(ctx_->GetScratchData<Int96>(num_values, &buffer));
-  if (type.unit() == TimeUnit::NANO) {
-    for (int i = 0; i < num_values; i++) {
-      internal::NanosecondsToImpalaTimestamp(values[i], buffer + i);
-    }
-  } else {
-    return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing");
-  }
+
+  ConvertArrowTimestampToParquetInt96(values, num_values, type.unit(), buffer);
+
   return WriteBatch(num_levels, def_levels, rep_levels, buffer);
 }
 
@@ -611,21 +653,15 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level
 
   const bool is_nanosecond = type.unit() == TimeUnit::NANO;
 
-  // In the case where support_deprecated_int96_timestamps was specified
-  // and coerce_timestamps_enabled was specified, a nanosecond column
-  // will have a physical
type of int64. In that case, we fall through - // to the else if below. - // - // See https://issues.apache.org/jira/browse/ARROW-2082 - if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps() && - !ctx_->properties->coerce_timestamps_enabled()) { + if (ctx_->properties->support_deprecated_int96_timestamps()) { + // The user explicitly required to use Int96 storage. return TypedWriteBatch(values, num_levels, def_levels, rep_levels); } else if (is_nanosecond || (ctx_->properties->coerce_timestamps_enabled() && (type.unit() != ctx_->properties->coerce_timestamps_unit()))) { // Casting is required. This covers several cases - // * Nanoseconds -> cast to microseconds + // * Nanoseconds -> cast to microseconds (until ARROW-3729 is resolved) // * coerce_timestamps_enabled_, cast all timestamps to requested unit return WriteTimestampsCoerce(ctx_->properties->truncated_timestamps_allowed(), values, num_levels, def_levels, rep_levels); @@ -656,10 +692,8 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ auto DivideBy = [&](const int64_t factor) { for (int64_t i = 0; i < array.length(); i++) { if (!truncated_timestamps_allowed && !data.IsNull(i) && (values[i] % factor != 0)) { - std::stringstream ss; - ss << "Casting from " << type.ToString() << " to " << target_type->ToString() - << " would lose data: " << values[i]; - return Status::Invalid(ss.str()); + return Status::Invalid("Casting from ", type.ToString(), " to ", + target_type->ToString(), " would lose data: ", values[i]); } buffer[i] = values[i] / factor; } @@ -861,6 +895,11 @@ Status ArrowColumnWriter::TypedWriteBatch( } Status ArrowColumnWriter::Write(const Array& data) { + if (data.length() == 0) { + // Write nothing when length is 0 + return Status::OK(); + } + ::arrow::Type::type values_type; RETURN_NOT_OK(GetLeafType(*data.type(), &values_type)); @@ -925,9 +964,8 @@ Status ArrowColumnWriter::Write(const Array& data) { default: break; } - std::stringstream ss; - ss << "Data type not supported as list value: " << values_array->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Data type not supported as list value: ", + values_array->type()->ToString()); } } // namespace @@ -1112,22 +1150,32 @@ Status WriteFileMetaData(const FileMetaData& file_metadata, namespace {} // namespace Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) { - if (chunk_size <= 0) { + if (chunk_size <= 0 && table.num_rows() > 0) { return Status::Invalid("chunk size per row_group must be greater than 0"); } else if (chunk_size > impl_->properties().max_row_group_length()) { chunk_size = impl_->properties().max_row_group_length(); } - for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) { - int64_t offset = chunk * chunk_size; - int64_t size = std::min(chunk_size, table.num_rows() - offset); - - RETURN_NOT_OK_ELSE(NewRowGroup(size), PARQUET_IGNORE_NOT_OK(Close())); + auto WriteRowGroup = [&](int64_t offset, int64_t size) { + RETURN_NOT_OK(NewRowGroup(size)); for (int i = 0; i < table.num_columns(); i++) { auto chunked_data = table.column(i)->data(); - RETURN_NOT_OK_ELSE(WriteColumnChunk(chunked_data, offset, size), - PARQUET_IGNORE_NOT_OK(Close())); + RETURN_NOT_OK(WriteColumnChunk(chunked_data, offset, size)); } + return Status::OK(); + }; + + if (table.num_rows() == 0) { + // Append a row group with 0 rows + RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close())); + return Status::OK(); + } + + for (int chunk = 0; chunk 
* chunk_size < table.num_rows(); chunk++) { + int64_t offset = chunk * chunk_size; + RETURN_NOT_OK_ELSE( + WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)), + PARQUET_IGNORE_NOT_OK(Close())); } return Status::OK(); } diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index 2538c028002e4..ab3d7e815cc9a 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -18,26 +18,37 @@ #ifndef PARQUET_ARROW_WRITER_H #define PARQUET_ARROW_WRITER_H +#include #include -#include "parquet/api/schema.h" -#include "parquet/api/writer.h" +#include "parquet/properties.h" +#include "parquet/types.h" +#include "parquet/util/visibility.h" -#include "arrow/io/interfaces.h" #include "arrow/type.h" namespace arrow { class Array; +class ChunkedArray; class MemoryPool; -class PrimitiveArray; -class Schema; class Status; -class StringArray; class Table; + +namespace io { + +class OutputStream; + +} // namespace io + } // namespace arrow namespace parquet { + +class FileMetaData; +class OutputStream; +class ParquetFileWriter; + namespace arrow { class PARQUET_EXPORT ArrowWriterProperties { @@ -45,19 +56,19 @@ class PARQUET_EXPORT ArrowWriterProperties { class Builder { public: Builder() - : write_nanos_as_int96_(false), + : write_timestamps_as_int96_(false), coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), truncated_timestamps_allowed_(false) {} virtual ~Builder() {} Builder* disable_deprecated_int96_timestamps() { - write_nanos_as_int96_ = false; + write_timestamps_as_int96_ = false; return this; } Builder* enable_deprecated_int96_timestamps() { - write_nanos_as_int96_ = true; + write_timestamps_as_int96_ = true; return this; } @@ -79,19 +90,19 @@ class PARQUET_EXPORT ArrowWriterProperties { std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( - write_nanos_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, + write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, truncated_timestamps_allowed_)); } private: - bool write_nanos_as_int96_; + bool write_timestamps_as_int96_; bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; }; - bool support_deprecated_int96_timestamps() const { return write_nanos_as_int96_; } + bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } ::arrow::TimeUnit::type coerce_timestamps_unit() const { @@ -105,12 +116,12 @@ class PARQUET_EXPORT ArrowWriterProperties { bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, bool truncated_timestamps_allowed) - : write_nanos_as_int96_(write_nanos_as_int96), + : write_timestamps_as_int96_(write_nanos_as_int96), coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), truncated_timestamps_allowed_(truncated_timestamps_allowed) {} - const bool write_nanos_as_int96_; + const bool write_timestamps_as_int96_; const bool coerce_timestamps_enabled_; const ::arrow::TimeUnit::type coerce_timestamps_unit_; const bool truncated_timestamps_allowed_; @@ -208,24 +219,52 @@ namespace internal { * Timestamp conversion constants */ constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); -constexpr int64_t kNanosecondsPerDay = INT64_C(86400000000000); -/** - * Converts nanosecond timestamps to Impala (Int96) format - */ -inline void 
NanosecondsToImpalaTimestamp(const int64_t nanoseconds, - Int96* impala_timestamp) { - int64_t julian_days = (nanoseconds / kNanosecondsPerDay) + kJulianEpochOffsetDays; +template +inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { + int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; (*impala_timestamp).value[2] = (uint32_t)julian_days; - int64_t last_day_nanos = nanoseconds % kNanosecondsPerDay; + int64_t last_day_units = time % UnitPerDay; int64_t* impala_last_day_nanos = reinterpret_cast(impala_timestamp); - *impala_last_day_nanos = last_day_nanos; + *impala_last_day_nanos = last_day_units * NanosecondsPerUnit; +} + +constexpr int64_t kSecondsInNanos = INT64_C(1000000000); + +inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp(seconds, + impala_timestamp); +} + +constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000); + +inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + milliseconds, impala_timestamp); +} + +constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000); + +inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + microseconds, impala_timestamp); +} + +constexpr int64_t kNanosecondsInNanos = INT64_C(1); + +inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + nanoseconds, impala_timestamp); } } // namespace internal } // namespace arrow + } // namespace parquet #endif // PARQUET_ARROW_WRITER_H diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index 31a33fa782a7b..8f5f695fde71f 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -15,17 +15,16 @@ // specific language governing permissions and limitations // under the License. -#include -#include #include +#include -#include "arrow/status.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/util/bit-util.h" #include "arrow/util/logging.h" #include "parquet/bloom_filter.h" #include "parquet/exception.h" #include "parquet/murmur3.h" -#include "parquet/types.h" namespace parquet { constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock]; diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 918780e04971a..a586dc2dcced6 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -18,18 +18,24 @@ #ifndef PARQUET_BLOOM_FILTER_H #define PARQUET_BLOOM_FILTER_H +#include #include #include +#include "arrow/util/bit-util.h" #include "arrow/util/logging.h" -#include "parquet/exception.h" #include "parquet/hasher.h" #include "parquet/types.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" +namespace arrow { + +class MemoryPool; + +} // namespace arrow + namespace parquet { -class OutputStream; // A Bloom filter is a compact structure to indicate whether an item is not in a set or // probably in a set. The Bloom filter usually consists of a bit set that represents a @@ -98,7 +104,8 @@ class PARQUET_EXPORT BloomFilter { /// Compute hash for fixed byte array value by using its plain encoding result. /// - /// @param value the value to hash. + /// @param value the value address. + /// @param len the value length. /// @return hash result. 
virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; @@ -154,11 +161,13 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { DCHECK(fpp > 0.0 && fpp < 1.0); const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); - uint32_t num_bits = static_cast(m); + uint32_t num_bits; // Handle overflow. if (m < 0 || m > kMaximumBloomFilterBytes << 3) { num_bits = static_cast(kMaximumBloomFilterBytes << 3); + } else { + num_bits = static_cast(m); } // Round up to lower bound @@ -183,6 +192,7 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { void InsertHash(uint64_t hash) override; void WriteTo(OutputStream* sink) const override; uint32_t GetBitsetSize() const override { return num_bytes_; } + uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); } uint64_t Hash(float value) const override { return hasher_->Hash(value); } uint64_t Hash(double value) const override { return hasher_->Hash(value); } @@ -192,6 +202,7 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { uint64_t Hash(const FLBA* value, uint32_t len) const override { return hasher_->Hash(value, len); } + /// Deserialize the Bloom filter from an input stream. It is used when reconstructing /// a Bloom filter from a parquet filter. /// diff --git a/cpp/src/parquet/column-io-benchmark.cc b/cpp/src/parquet/column-io-benchmark.cc index 8f286f4910000..3e32b2a837815 100644 --- a/cpp/src/parquet/column-io-benchmark.cc +++ b/cpp/src/parquet/column-io-benchmark.cc @@ -20,6 +20,7 @@ #include "parquet/column_reader.h" #include "parquet/column_writer.h" #include "parquet/file_reader.h" +#include "parquet/metadata.h" #include "parquet/thrift.h" #include "parquet/util/memory.h" @@ -35,8 +36,8 @@ std::unique_ptr BuildWriter(int64_t output_size, OutputStream* dst, const WriterProperties* properties) { std::unique_ptr pager = PageWriter::Open(dst, Compression::UNCOMPRESSED, metadata); - return std::unique_ptr( - new Int64Writer(metadata, std::move(pager), Encoding::PLAIN, properties)); + return std::unique_ptr(new Int64Writer( + metadata, std::move(pager), false /*use_dictionary*/, Encoding::PLAIN, properties)); } std::shared_ptr Int64Schema(Repetition::type repetition) { diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc index 60f2be2362510..0475ca591de02 100644 --- a/cpp/src/parquet/column_reader-test.cc +++ b/cpp/src/parquet/column_reader-test.cc @@ -102,7 +102,7 @@ class TestPrimitiveReader : public ::testing::Test { &vresult[0] + total_values_read, &values_read)); total_values_read += static_cast(values_read); batch_actual += batch; - batch_size = std::max(batch_size * 2, 4096); + batch_size = std::min(1 << 24, std::max(batch_size * 2, 4096)); } while (batch > 0); ASSERT_EQ(num_levels_, batch_actual); @@ -147,7 +147,7 @@ class TestPrimitiveReader : public ::testing::Test { total_values_read += batch - static_cast(null_count); batch_actual += batch; levels_actual += static_cast(levels_read); - batch_size = std::max(batch_size * 2, 4096); + batch_size = std::min(1 << 24, std::max(batch_size * 2, 4096)); } while ((batch > 0) || (levels_read > 0)); ASSERT_EQ(num_levels_, levels_actual); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7fbf9babd71fa..113d50a40aada 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -17,20 +17,22 @@ #include "parquet/column_reader.h" -#include #include +#include +#include 
#include -#include -#include -#include -#include -#include -#include +#include "arrow/buffer.h" +#include "arrow/util/bit-stream-utils.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/compression.h" +#include "arrow/util/logging.h" +#include "arrow/util/rle-encoding.h" #include "parquet/column_page.h" -#include "parquet/encoding-internal.h" +#include "parquet/encoding.h" #include "parquet/properties.h" +#include "parquet/statistics.h" #include "parquet/thrift.h" using arrow::MemoryPool; @@ -290,18 +292,17 @@ void TypedColumnReader::ConfigureDictionary(const DictionaryPage* page) { if (page->encoding() == Encoding::PLAIN_DICTIONARY || page->encoding() == Encoding::PLAIN) { - PlainDecoder dictionary(descr_); - dictionary.SetData(page->num_values(), page->data(), page->size()); + auto dictionary = MakeTypedDecoder(Encoding::PLAIN, descr_); + dictionary->SetData(page->num_values(), page->data(), page->size()); - // The dictionary is fully decoded during DictionaryDecoder::Init, so the + // The dictionary is fully decoded during SetData, so the // DictionaryPage buffer is no longer required after this step // // TODO(wesm): investigate whether this all-or-nothing decoding of the // dictionary makes sense and whether performance can be improved - - auto decoder = std::make_shared>(descr_, pool_); - decoder->SetDict(&dictionary); - decoders_[encoding] = decoder; + auto decoder = MakeDictDecoder(descr_, pool_); + decoder->SetDict(dictionary.get()); + decoders_[encoding] = std::move(decoder); } else { ParquetException::NYI("only plain dictionary encoding has been implemented"); } @@ -385,9 +386,9 @@ bool TypedColumnReader::ReadNewPage() { } else { switch (encoding) { case Encoding::PLAIN: { - std::shared_ptr decoder(new PlainDecoder(descr_)); - decoders_[static_cast(encoding)] = decoder; + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_); current_decoder_ = decoder.get(); + decoders_[static_cast(encoding)] = std::move(decoder); break; } case Encoding::RLE_DICTIONARY: diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 42bf900c97932..19513c210d327 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -19,26 +19,20 @@ #define PARQUET_COLUMN_READER_H #include -#include #include -#include -#include #include #include #include -#include -#include -#include -#include -#include +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" -#include "parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -56,6 +50,9 @@ class RleDecoder; namespace parquet { +class DictionaryPage; +class Page; + // 16 MB is the default maximum page header size static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; @@ -290,7 +287,7 @@ class PARQUET_TEMPLATE_CLASS_EXPORT TypedColumnReader : public ColumnReader { int64_t Skip(int64_t num_rows_to_skip); private: - typedef Decoder DecoderType; + using DecoderType = TypedDecoder; // Advance to the next data page bool ReadNewPage() override; @@ -312,10 +309,9 @@ class PARQUET_TEMPLATE_CLASS_EXPORT TypedColumnReader : public ColumnReader { // Map of encoding type to the respective decoder object. For example, a // column chunk's data pages may include both dictionary-encoded and // plain-encoded data. 
- std::unordered_map> decoders_; + std::unordered_map> decoders_; void ConfigureDictionary(const DictionaryPage* page); - DecoderType* current_decoder_; }; diff --git a/cpp/src/parquet/column_scanner.cc b/cpp/src/parquet/column_scanner.cc index 51c87732959fb..8011318a78c9a 100644 --- a/cpp/src/parquet/column_scanner.cc +++ b/cpp/src/parquet/column_scanner.cc @@ -21,7 +21,6 @@ #include #include "parquet/column_reader.h" -#include "parquet/util/memory.h" using arrow::MemoryPool; diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h index f23c86173cb32..cb0da2c9e18f1 100644 --- a/cpp/src/parquet/column_scanner.h +++ b/cpp/src/parquet/column_scanner.h @@ -25,11 +25,13 @@ #include #include +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" + #include "parquet/column_reader.h" #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" -#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -87,7 +89,7 @@ class PARQUET_EXPORT Scanner { }; template -class PARQUET_EXPORT TypedScanner : public Scanner { +class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner { public: typedef typename DType::c_type T; diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index 4416e3d18e9ad..1f034b622719a 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -21,6 +21,8 @@ #include "parquet/column_reader.h" #include "parquet/column_writer.h" +#include "parquet/metadata.h" +#include "parquet/properties.h" #include "parquet/test-specialization.h" #include "parquet/test-util.h" #include "parquet/thrift.h" @@ -43,11 +45,15 @@ const int SMALL_SIZE = 100; const int LARGE_SIZE = 10000; // Very large size to test dictionary fallback. const int VERY_LARGE_SIZE = 40000; +// Reduced dictionary page size to use for testing dictionary fallback with valgrind +const int64_t DICTIONARY_PAGE_SIZE = 1024; #else // Larger size to test some corner cases, only used in some specific cases. const int LARGE_SIZE = 100000; // Very large size to test dictionary fallback. 
const int VERY_LARGE_SIZE = 400000; +// Dictionary page size to use for testing dictionary fallback +const int64_t DICTIONARY_PAGE_SIZE = 1024 * 1024; #endif template @@ -79,12 +85,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { std::shared_ptr> BuildWriter( int64_t output_size = SMALL_SIZE, - const ColumnProperties& column_properties = ColumnProperties()) { + const ColumnProperties& column_properties = ColumnProperties(), + const ParquetVersion::type version = ParquetVersion::PARQUET_1_0) { sink_.reset(new InMemoryOutputStream()); WriterProperties::Builder wp_builder; + wp_builder.version(version); if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY || column_properties.encoding() == Encoding::RLE_DICTIONARY) { wp_builder.enable_dictionary(); + wp_builder.dictionary_pagesize_limit(DICTIONARY_PAGE_SIZE); } else { wp_builder.disable_dictionary(); wp_builder.encoding(column_properties.encoding()); @@ -128,6 +137,50 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows)); } + void TestDictionaryFallbackEncoding(ParquetVersion::type version) { + this->GenerateData(VERY_LARGE_SIZE); + ColumnProperties column_properties; + column_properties.set_dictionary_enabled(true); + + if (version == ParquetVersion::PARQUET_1_0) { + column_properties.set_encoding(Encoding::PLAIN_DICTIONARY); + } else { + column_properties.set_encoding(Encoding::RLE_DICTIONARY); + } + + auto writer = this->BuildWriter(VERY_LARGE_SIZE, column_properties, version); + + writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); + writer->Close(); + + // Read all rows so we are sure that also the non-dictionary pages are read correctly + this->SetupValuesOut(VERY_LARGE_SIZE); + this->ReadColumnFully(); + ASSERT_EQ(VERY_LARGE_SIZE, this->values_read_); + this->values_.resize(VERY_LARGE_SIZE); + ASSERT_EQ(this->values_, this->values_out_); + std::vector encodings = this->metadata_encodings(); + + if (this->type_num() == Type::BOOLEAN) { + // Dictionary encoding is not allowed for boolean type + // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case + std::vector expected({Encoding::PLAIN, Encoding::RLE}); + ASSERT_EQ(encodings, expected); + } else if (version == ParquetVersion::PARQUET_1_0) { + // There are 4 encodings (PLAIN_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case + // for version 1.0 + std::vector expected( + {Encoding::PLAIN_DICTIONARY, Encoding::PLAIN, Encoding::RLE, Encoding::PLAIN}); + ASSERT_EQ(encodings, expected); + } else { + // There are 4 encodings (RLE_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case for + // version 2.0 + std::vector expected( + {Encoding::RLE_DICTIONARY, Encoding::PLAIN, Encoding::RLE, Encoding::PLAIN}); + ASSERT_EQ(encodings, expected); + } + } + void WriteRequiredWithSettings(Encoding::type encoding, Compression::type compression, bool enable_dictionary, bool enable_statistics, int64_t num_rows) { @@ -349,11 +402,6 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithLz4Compression) { LARGE_SIZE); } -TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithZstdCompression) { - this->TestRequiredWithSettings(Encoding::PLAIN, Compression::ZSTD, false, false, - LARGE_SIZE); -} - TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStats) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::UNCOMPRESSED, false, true, LARGE_SIZE); @@ -379,10 +427,19 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndLz4Compression) { LARGE_SIZE); } +// The 
ExternalProject for zstd does not build on CMake < 3.7, so we do not +// require it here +#ifdef ARROW_WITH_ZSTD +TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithZstdCompression) { + this->TestRequiredWithSettings(Encoding::PLAIN, Compression::ZSTD, false, false, + LARGE_SIZE); +} + TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndZstdCompression) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::ZSTD, false, true, LARGE_SIZE); } +#endif TYPED_TEST(TestPrimitiveWriter, Optional) { // Optional and non-repeated, with definition levels @@ -474,32 +531,13 @@ TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) { ASSERT_EQ(this->values_, this->values_out_); } -// Test case for dictionary fallback encoding -TYPED_TEST(TestPrimitiveWriter, RequiredVeryLargeChunk) { - this->GenerateData(VERY_LARGE_SIZE); - - auto writer = this->BuildWriter(VERY_LARGE_SIZE, Encoding::PLAIN_DICTIONARY); - writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); - writer->Close(); +// Test cases for dictionary fallback encoding +TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) { + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0); +} - // Read all rows so we are sure that also the non-dictionary pages are read correctly - this->SetupValuesOut(VERY_LARGE_SIZE); - this->ReadColumnFully(); - ASSERT_EQ(VERY_LARGE_SIZE, this->values_read_); - this->values_.resize(VERY_LARGE_SIZE); - ASSERT_EQ(this->values_, this->values_out_); - std::vector encodings = this->metadata_encodings(); - // There are 3 encodings (RLE, PLAIN_DICTIONARY, PLAIN) in a fallback case - // Dictionary encoding is not allowed for boolean type - // There are 2 encodings (RLE, PLAIN) in a non dictionary encoding case - if (this->type_num() != Type::BOOLEAN) { - ASSERT_EQ(Encoding::PLAIN_DICTIONARY, encodings[0]); - ASSERT_EQ(Encoding::PLAIN, encodings[1]); - ASSERT_EQ(Encoding::RLE, encodings[2]); - } else { - ASSERT_EQ(Encoding::PLAIN, encodings[0]); - ASSERT_EQ(Encoding::RLE, encodings[1]); - } +TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) { + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_0); } // PARQUET-719 diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 37fce9c036b31..0919a3f1d7a65 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -17,23 +17,32 @@ #include "parquet/column_writer.h" +#include #include #include #include +#include "arrow/status.h" +#include "arrow/util/bit-stream-utils.h" #include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" -#include "parquet/encoding-internal.h" +#include "parquet/metadata.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" +#include "parquet/types.h" #include "parquet/util/memory.h" namespace parquet { +namespace BitUtil = ::arrow::BitUtil; + +using ::arrow::internal::checked_cast; + using BitWriter = ::arrow::BitUtil::BitWriter; using RleEncoder = ::arrow::util::RleEncoder; @@ -141,6 +150,7 @@ class SerializedPageWriter : public PageWriter { total_uncompressed_size_(0), total_compressed_size_(0) { compressor_ = GetCodecFromArrow(codec); + thrift_serializer_.reset(new ThriftSerializer); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -171,8 +181,7 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { 
dictionary_page_offset_ = start_pos; } - int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), sink_); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_); sink_->Write(compressed_data->data(), compressed_data->size()); total_uncompressed_size_ += uncompressed_size + header_size; @@ -237,8 +246,7 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), sink_); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_); sink_->Write(compressed_data->data(), compressed_data->size()); total_uncompressed_size_ += uncompressed_size + header_size; @@ -270,6 +278,8 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size_; int64_t total_compressed_size_; + std::unique_ptr thrift_serializer_; + // Compression codec to use. std::unique_ptr<::arrow::util::Codec> compressor_; }; @@ -533,23 +543,12 @@ void ColumnWriter::FlushBufferedDataPages() { template TypedColumnWriter::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, std::unique_ptr pager, + const bool use_dictionary, Encoding::type encoding, const WriterProperties* properties) - : ColumnWriter(metadata, std::move(pager), - (encoding == Encoding::PLAIN_DICTIONARY || - encoding == Encoding::RLE_DICTIONARY), - encoding, properties) { - switch (encoding) { - case Encoding::PLAIN: - current_encoder_.reset(new PlainEncoder(descr_, properties->memory_pool())); - break; - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE_DICTIONARY: - current_encoder_.reset(new DictEncoder(descr_, properties->memory_pool())); - break; - default: - ParquetException::NYI("Selected encoding is not supported"); - } + : ColumnWriter(metadata, std::move(pager), use_dictionary, encoding, properties) { + current_encoder_ = MakeEncoder(Type::type_num, encoding, use_dictionary, descr_, + properties->memory_pool()); if (properties->statistics_enabled(descr_->path()) && (SortOrder::UNKNOWN != descr_->sort_order())) { @@ -562,27 +561,33 @@ TypedColumnWriter::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, // Fallback to PLAIN if dictionary page limit is reached. 
template <typename DType>
 void TypedColumnWriter<DType>::CheckDictionarySizeLimit() {
-  auto dict_encoder = static_cast<DictEncoder<DType>*>(current_encoder_.get());
+  // We have to use dynamic_cast here because some compilers will not cast
+  // through TypedEncoder's virtual inheritance
+  auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
   if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
     WriteDictionaryPage();
     // Serialize the buffered Dictionary Indices
     FlushBufferedDataPages();
     fallback_ = true;
     // Only PLAIN encoding is supported for fallback in V1
-    current_encoder_.reset(new PlainEncoder<DType>(descr_, properties_->memory_pool()));
+    current_encoder_ = MakeEncoder(Type::type_num, Encoding::PLAIN, false, descr_,
+                                   properties_->memory_pool());
     encoding_ = Encoding::PLAIN;
   }
 }
 
 template <typename DType>
 void TypedColumnWriter<DType>::WriteDictionaryPage() {
-  auto dict_encoder = static_cast<DictEncoder<DType>*>(current_encoder_.get());
+  // We have to use dynamic_cast here because some compilers will not cast
+  // through TypedEncoder's virtual inheritance
+  auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+  DCHECK(dict_encoder);
   std::shared_ptr<Buffer> buffer =
       AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
   dict_encoder->WriteDict(buffer->mutable_data());
 
   DictionaryPage page(buffer, dict_encoder->num_entries(),
-                      properties_->dictionary_index_encoding());
+                      properties_->dictionary_page_encoding());
   total_bytes_written_ += pager_->WriteDictionaryPage(page);
 }
 
@@ -615,36 +620,37 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
                                                  std::unique_ptr<PageWriter> pager,
                                                  const WriterProperties* properties) {
   const ColumnDescriptor* descr = metadata->descr();
+  const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
+                              descr->physical_type() != Type::BOOLEAN;
   Encoding::type encoding = properties->encoding(descr->path());
-  if (properties->dictionary_enabled(descr->path()) &&
-      descr->physical_type() != Type::BOOLEAN) {
-    encoding = properties->dictionary_page_encoding();
+  if (use_dictionary) {
+    encoding = properties->dictionary_index_encoding();
   }
   switch (descr->physical_type()) {
     case Type::BOOLEAN:
-      return std::make_shared<BoolWriter>(metadata, std::move(pager), encoding,
-                                          properties);
+      return std::make_shared<BoolWriter>(metadata, std::move(pager), use_dictionary,
+                                          encoding, properties);
     case Type::INT32:
-      return std::make_shared<Int32Writer>(metadata, std::move(pager), encoding,
-                                           properties);
+      return std::make_shared<Int32Writer>(metadata, std::move(pager), use_dictionary,
+                                           encoding, properties);
     case Type::INT64:
-      return std::make_shared<Int64Writer>(metadata, std::move(pager), encoding,
-                                           properties);
+      return std::make_shared<Int64Writer>(metadata, std::move(pager), use_dictionary,
+                                           encoding, properties);
    case Type::INT96:
-      return std::make_shared<Int96Writer>(metadata, std::move(pager), encoding,
-                                           properties);
+      return std::make_shared<Int96Writer>(metadata, std::move(pager), use_dictionary,
+                                           encoding, properties);
     case Type::FLOAT:
-      return std::make_shared<FloatWriter>(metadata, std::move(pager), encoding,
-                                           properties);
+      return std::make_shared<FloatWriter>(metadata, std::move(pager), use_dictionary,
+                                           encoding, properties);
     case Type::DOUBLE:
-      return std::make_shared<DoubleWriter>(metadata, std::move(pager), encoding,
-                                            properties);
+      return std::make_shared<DoubleWriter>(metadata, std::move(pager), use_dictionary,
+                                            encoding, properties);
     case Type::BYTE_ARRAY:
-      return std::make_shared<ByteArrayWriter>(metadata, std::move(pager), encoding,
-                                               properties);
+      return std::make_shared<ByteArrayWriter>(metadata, std::move(pager),
+                                               use_dictionary, encoding, properties);
     case Type::FIXED_LEN_BYTE_ARRAY:
-      return std::make_shared<FixedLenByteArrayWriter>(metadata,
std::move(pager), - encoding, properties); + return std::make_shared( + metadata, std::move(pager), use_dictionary, encoding, properties); default: ParquetException::NYI("type reader not implemented"); } @@ -840,7 +846,8 @@ void TypedColumnWriter::WriteBatchSpaced( template void TypedColumnWriter::WriteValues(int64_t num_values, const T* values) { - current_encoder_->Put(values, static_cast(num_values)); + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); } template @@ -848,8 +855,8 @@ void TypedColumnWriter::WriteValuesSpaced(int64_t num_values, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) { - current_encoder_->PutSpaced(values, static_cast(num_values), valid_bits, - valid_bits_offset); + dynamic_cast(current_encoder_.get()) + ->PutSpaced(values, static_cast(num_values), valid_bits, valid_bits_offset); } template class PARQUET_TEMPLATE_EXPORT TypedColumnWriter; diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index e665ca718ffa5..254bf0dd02e50 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -17,17 +17,18 @@ #pragma once +#include #include #include +#include "arrow/memory_pool.h" + #include "parquet/column_page.h" #include "parquet/encoding.h" -#include "parquet/metadata.h" -#include "parquet/properties.h" +#include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/types.h" -#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -45,6 +46,9 @@ class RleEncoder; namespace parquet { +class ColumnChunkMetaDataBuilder; +class WriterProperties; + class PARQUET_EXPORT LevelEncoder { public: LevelEncoder(); @@ -235,8 +239,8 @@ class PARQUET_TEMPLATE_CLASS_EXPORT TypedColumnWriter : public ColumnWriter { typedef typename DType::c_type T; TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, - std::unique_ptr pager, Encoding::type encoding, - const WriterProperties* properties); + std::unique_ptr pager, const bool use_dictionary, + Encoding::type encoding, const WriterProperties* properties); // Write a batch of repetition levels, definition levels, and values to the // column. 
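With current_encoder_ about to be retyped (see the ValueEncoderType alias introduced in the next hunk), call sites that know the column's static type get a fully typed encoder and need no cast at all. A minimal sketch using the MakeTypedEncoder factory that the encoding benchmarks later in this diff also rely on; illustrative only, not code from this patch.

    #include "parquet/encoding.h"
    #include "parquet/types.h"

    int main() {
      // EncodingTraits<Int64Type>::Encoder is the typed encoder interface, so
      // Put() accepts int64_t directly.
      auto encoder = parquet::MakeTypedEncoder<parquet::Int64Type>(
          parquet::Encoding::PLAIN);
      const int64_t values[] = {1, 2, 3, 4};
      encoder->Put(values, 4);
      // FlushValues() hands back the plain-encoded bytes for the data page.
      auto encoded = encoder->FlushValues();
      (void)encoded;
      return 0;
    }

The dynamic_casts in WriteValues()/WriteValuesSpaced() remain only because the member is stored through the base Encoder during dictionary fallback.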
@@ -297,13 +301,13 @@ class PARQUET_TEMPLATE_CLASS_EXPORT TypedColumnWriter : public ColumnWriter { int64_t valid_bits_offset, const T* values, int64_t* num_spaced_written); - typedef Encoder EncoderType; - // Write values to a temporary buffer before they are encoded into pages void WriteValues(int64_t num_values, const T* values); void WriteValuesSpaced(int64_t num_values, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values); - std::unique_ptr current_encoder_; + + using ValueEncoderType = typename EncodingTraits::Encoder; + std::unique_ptr current_encoder_; typedef TypedRowGroupStatistics TypedStats; std::unique_ptr page_statistics_; diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index f8d2839af7ca7..8031aeb7ce168 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -17,7 +17,8 @@ #include "benchmark/benchmark.h" -#include "parquet/encoding-internal.h" +#include "parquet/encoding.h" +#include "parquet/schema.h" #include "parquet/util/memory.h" using arrow::default_memory_pool; @@ -27,39 +28,39 @@ namespace parquet { using schema::PrimitiveNode; -namespace benchmark { - std::shared_ptr Int64Schema(Repetition::type repetition) { auto node = PrimitiveNode::Make("int64", repetition, Type::INT64); return std::make_shared(node, repetition != Repetition::REQUIRED, repetition == Repetition::REPEATED); } -static void BM_PlainEncodingBoolean(::benchmark::State& state) { - std::vector values(state.range(0), 64); - PlainEncoder encoder(nullptr); +static void BM_PlainEncodingBoolean(benchmark::State& state) { + std::vector values(state.range(0), true); + auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::PLAIN); + auto typed_encoder = dynamic_cast(encoder.get()); while (state.KeepRunning()) { - encoder.Put(values, static_cast(values.size())); - encoder.FlushValues(); + typed_encoder->Put(values, static_cast(values.size())); + typed_encoder->FlushValues(); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); } BENCHMARK(BM_PlainEncodingBoolean)->Range(1024, 65536); -static void BM_PlainDecodingBoolean(::benchmark::State& state) { - std::vector values(state.range(0), 64); +static void BM_PlainDecodingBoolean(benchmark::State& state) { + std::vector values(state.range(0), true); bool* output = new bool[state.range(0)]; - PlainEncoder encoder(nullptr); - encoder.Put(values, static_cast(values.size())); - std::shared_ptr buf = encoder.FlushValues(); + auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::PLAIN); + auto typed_encoder = dynamic_cast(encoder.get()); + typed_encoder->Put(values, static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); while (state.KeepRunning()) { - PlainDecoder decoder(nullptr); - decoder.SetData(static_cast(values.size()), buf->data(), - static_cast(buf->size())); - decoder.Decode(output, static_cast(values.size())); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(static_cast(values.size()), buf->data(), + static_cast(buf->size())); + decoder->Decode(output, static_cast(values.size())); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); @@ -68,30 +69,29 @@ static void BM_PlainDecodingBoolean(::benchmark::State& state) { BENCHMARK(BM_PlainDecodingBoolean)->Range(1024, 65536); -static void BM_PlainEncodingInt64(::benchmark::State& state) { +static void BM_PlainEncodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); - PlainEncoder encoder(nullptr); - + 
auto encoder = MakeTypedEncoder(Encoding::PLAIN); while (state.KeepRunning()) { - encoder.Put(values.data(), static_cast(values.size())); - encoder.FlushValues(); + encoder->Put(values.data(), static_cast(values.size())); + encoder->FlushValues(); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(int64_t)); } BENCHMARK(BM_PlainEncodingInt64)->Range(1024, 65536); -static void BM_PlainDecodingInt64(::benchmark::State& state) { +static void BM_PlainDecodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); - PlainEncoder encoder(nullptr); - encoder.Put(values.data(), static_cast(values.size())); - std::shared_ptr buf = encoder.FlushValues(); + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + encoder->Put(values.data(), static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); while (state.KeepRunning()) { - PlainDecoder decoder(nullptr); - decoder.SetData(static_cast(values.size()), buf->data(), - static_cast(buf->size())); - decoder.Decode(values.data(), static_cast(values.size())); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(static_cast(values.size()), buf->data(), + static_cast(buf->size())); + decoder->Decode(values.data(), static_cast(values.size())); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(int64_t)); } @@ -100,44 +100,47 @@ BENCHMARK(BM_PlainDecodingInt64)->Range(1024, 65536); template static void DecodeDict(std::vector& values, - ::benchmark::State& state) { + benchmark::State& state) { typedef typename Type::c_type T; int num_values = static_cast(values.size()); MemoryPool* allocator = default_memory_pool(); std::shared_ptr descr = Int64Schema(Repetition::REQUIRED); - DictEncoder encoder(descr.get(), allocator); - for (int i = 0; i < num_values; ++i) { - encoder.Put(values[i]); - } + auto base_encoder = + MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr.get(), allocator); + auto encoder = + dynamic_cast::Encoder*>(base_encoder.get()); + auto dict_traits = dynamic_cast*>(base_encoder.get()); + encoder->Put(values.data(), num_values); std::shared_ptr dict_buffer = - AllocateBuffer(allocator, encoder.dict_encoded_size()); + AllocateBuffer(allocator, dict_traits->dict_encoded_size()); std::shared_ptr indices = - AllocateBuffer(allocator, encoder.EstimatedDataEncodedSize()); + AllocateBuffer(allocator, encoder->EstimatedDataEncodedSize()); - encoder.WriteDict(dict_buffer->mutable_data()); - int actual_bytes = - encoder.WriteIndices(indices->mutable_data(), static_cast(indices->size())); + dict_traits->WriteDict(dict_buffer->mutable_data()); + int actual_bytes = dict_traits->WriteIndices(indices->mutable_data(), + static_cast(indices->size())); PARQUET_THROW_NOT_OK(indices->Resize(actual_bytes)); while (state.KeepRunning()) { - PlainDecoder dict_decoder(descr.get()); - dict_decoder.SetData(encoder.num_entries(), dict_buffer->data(), - static_cast(dict_buffer->size())); - DictionaryDecoder decoder(descr.get()); - decoder.SetDict(&dict_decoder); - decoder.SetData(num_values, indices->data(), static_cast(indices->size())); - decoder.Decode(values.data(), num_values); + auto dict_decoder = MakeTypedDecoder(Encoding::PLAIN, descr.get()); + dict_decoder->SetData(dict_traits->num_entries(), dict_buffer->data(), + static_cast(dict_buffer->size())); + + auto decoder = MakeDictDecoder(descr.get()); + decoder->SetDict(dict_decoder.get()); + decoder->SetData(num_values, indices->data(), static_cast(indices->size())); + decoder->Decode(values.data(), num_values); } 
state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(T)); } -static void BM_DictDecodingInt64_repeats(::benchmark::State& state) { +static void BM_DictDecodingInt64_repeats(benchmark::State& state) { typedef Int64Type Type; typedef typename Type::c_type T; @@ -147,7 +150,7 @@ static void BM_DictDecodingInt64_repeats(::benchmark::State& state) { BENCHMARK(BM_DictDecodingInt64_repeats)->Range(1024, 65536); -static void BM_DictDecodingInt64_literals(::benchmark::State& state) { +static void BM_DictDecodingInt64_literals(benchmark::State& state) { typedef Int64Type Type; typedef typename Type::c_type T; @@ -160,6 +163,4 @@ static void BM_DictDecodingInt64_literals(::benchmark::State& state) { BENCHMARK(BM_DictDecodingInt64_literals)->Range(1024, 65536); -} // namespace benchmark - } // namespace parquet diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h deleted file mode 100644 index e2dfc2380ddcf..0000000000000 --- a/cpp/src/parquet/encoding-internal.h +++ /dev/null @@ -1,850 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
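The remainder of this section is the deleted body of encoding-internal.h. For reference, here is an encode/decode round trip using the public factories that replace it, modeled on the DecodeDict benchmark above. This is a sketch under the assumption that the factory signatures match the benchmark usage; buffer sizing is simplified and error checking omitted.

    #include <memory>
    #include <vector>

    #include "arrow/memory_pool.h"
    #include "parquet/encoding.h"
    #include "parquet/schema.h"
    #include "parquet/types.h"

    using namespace parquet;  // brevity, sketch only

    int main() {
      auto node = schema::PrimitiveNode::Make("int64", Repetition::REQUIRED,
                                              Type::INT64);
      ColumnDescriptor descr(node, 0, 0);

      // The factory returns the base Encoder; the benchmark casts it to both
      // of its faces, the typed encoder and the dictionary traits.
      auto base = MakeEncoder(Type::INT64, Encoding::PLAIN,
                              /*use_dictionary=*/true, &descr,
                              ::arrow::default_memory_pool());
      auto typed = dynamic_cast<EncodingTraits<Int64Type>::Encoder*>(base.get());
      auto dict = dynamic_cast<DictEncoder<Int64Type>*>(base.get());

      std::vector<int64_t> values = {1, 1, 2, 3, 3, 3};
      typed->Put(values.data(), static_cast<int>(values.size()));

      // Dictionary page bytes, then the RLE indices (bit width written first).
      std::vector<uint8_t> dict_page(dict->dict_encoded_size());
      dict->WriteDict(dict_page.data());
      std::vector<uint8_t> indices(
          static_cast<size_t>(typed->EstimatedDataEncodedSize()));
      int indices_len =
          dict->WriteIndices(indices.data(), static_cast<int>(indices.size()));

      // Decode: load the dictionary with a PLAIN decoder, then decode indices.
      auto dict_decoder = MakeTypedDecoder<Int64Type>(Encoding::PLAIN, &descr);
      dict_decoder->SetData(dict->num_entries(), dict_page.data(),
                            static_cast<int>(dict_page.size()));
      auto decoder = MakeDictDecoder<Int64Type>(&descr);
      decoder->SetDict(dict_decoder.get());
      decoder->SetData(static_cast<int>(values.size()), indices.data(),
                       indices_len);

      std::vector<int64_t> out(values.size());
      decoder->Decode(out.data(), static_cast<int>(out.size()));
      return 0;
    }

The dynamic_casts mirror the benchmark: DictEncoder<T> sits beside the typed encoder via virtual inheritance, which is also why the column writer above casts the same way.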
- -#ifndef PARQUET_ENCODING_INTERNAL_H -#define PARQUET_ENCODING_INTERNAL_H - -#include -#include -#include -#include -#include - -#include "arrow/util/bit-stream-utils.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/hashing.h" -#include "arrow/util/macros.h" -#include "arrow/util/rle-encoding.h" - -#include "parquet/encoding.h" -#include "parquet/exception.h" -#include "parquet/schema.h" -#include "parquet/types.h" -#include "parquet/util/memory.h" - -namespace parquet { - -namespace BitUtil = ::arrow::BitUtil; - -class ColumnDescriptor; - -// ---------------------------------------------------------------------- -// Encoding::PLAIN decoder implementation - -template -class PlainDecoder : public Decoder { - public: - typedef typename DType::c_type T; - using Decoder::num_values_; - - explicit PlainDecoder(const ColumnDescriptor* descr) - : Decoder(descr, Encoding::PLAIN), data_(nullptr), len_(0) { - if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { - type_length_ = descr_->type_length(); - } else { - type_length_ = -1; - } - } - - virtual void SetData(int num_values, const uint8_t* data, int len) { - num_values_ = num_values; - data_ = data; - len_ = len; - } - - virtual int Decode(T* buffer, int max_values); - - private: - using Decoder::descr_; - const uint8_t* data_; - int len_; - int type_length_; -}; - -// Decode routine templated on C++ type rather than type enum -template -inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, - int type_length, T* out) { - int bytes_to_decode = num_values * static_cast(sizeof(T)); - if (data_size < bytes_to_decode) { - ParquetException::EofException(); - } - memcpy(out, data, bytes_to_decode); - return bytes_to_decode; -} - -// Template specialization for BYTE_ARRAY. The written values do not own their -// own data. -template <> -inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, - int type_length, ByteArray* out) { - int bytes_decoded = 0; - int increment; - for (int i = 0; i < num_values; ++i) { - uint32_t len = out[i].len = *reinterpret_cast(data); - increment = static_cast(sizeof(uint32_t) + len); - if (data_size < increment) ParquetException::EofException(); - out[i].ptr = data + sizeof(uint32_t); - data += increment; - data_size -= increment; - bytes_decoded += increment; - } - return bytes_decoded; -} - -// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not -// own their own data. 
-template <> -inline int DecodePlain(const uint8_t* data, int64_t data_size, - int num_values, int type_length, - FixedLenByteArray* out) { - int bytes_to_decode = type_length * num_values; - if (data_size < bytes_to_decode) { - ParquetException::EofException(); - } - for (int i = 0; i < num_values; ++i) { - out[i].ptr = data; - data += type_length; - data_size -= type_length; - } - return bytes_to_decode; -} - -template -inline int PlainDecoder::Decode(T* buffer, int max_values) { - max_values = std::min(max_values, num_values_); - int bytes_consumed = DecodePlain(data_, len_, max_values, type_length_, buffer); - data_ += bytes_consumed; - len_ -= bytes_consumed; - num_values_ -= max_values; - return max_values; -} - -template <> -class PlainDecoder : public Decoder { - public: - explicit PlainDecoder(const ColumnDescriptor* descr) - : Decoder(descr, Encoding::PLAIN) {} - - virtual void SetData(int num_values, const uint8_t* data, int len) { - num_values_ = num_values; - bit_reader_ = BitUtil::BitReader(data, len); - } - - // Two flavors of bool decoding - int Decode(uint8_t* buffer, int max_values) { - max_values = std::min(max_values, num_values_); - bool val; - ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); - for (int i = 0; i < max_values; ++i) { - if (!bit_reader_.GetValue(1, &val)) { - ParquetException::EofException(); - } - if (val) { - bit_writer.Set(); - } - bit_writer.Next(); - } - bit_writer.Finish(); - num_values_ -= max_values; - return max_values; - } - - virtual int Decode(bool* buffer, int max_values) { - max_values = std::min(max_values, num_values_); - if (bit_reader_.GetBatch(1, buffer, max_values) != max_values) { - ParquetException::EofException(); - } - num_values_ -= max_values; - return max_values; - } - - private: - BitUtil::BitReader bit_reader_; -}; - -// ---------------------------------------------------------------------- -// Encoding::PLAIN encoder implementation - -template -class PlainEncoder : public Encoder { - public: - typedef typename DType::c_type T; - - explicit PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : Encoder(descr, Encoding::PLAIN, pool) { - values_sink_.reset(new InMemoryOutputStream(pool)); - } - - int64_t EstimatedDataEncodedSize() override { return values_sink_->Tell(); } - - std::shared_ptr FlushValues() override; - void Put(const T* src, int num_values) override; - - protected: - std::unique_ptr values_sink_; -}; - -template <> -class PlainEncoder : public Encoder { - public: - explicit PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : Encoder(descr, Encoding::PLAIN, pool), - bits_available_(kInMemoryDefaultCapacity * 8), - bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)), - values_sink_(new InMemoryOutputStream(pool)) { - bit_writer_.reset(new BitUtil::BitWriter(bits_buffer_->mutable_data(), - static_cast(bits_buffer_->size()))); - } - - int64_t EstimatedDataEncodedSize() override { - return values_sink_->Tell() + bit_writer_->bytes_written(); - } - - std::shared_ptr FlushValues() override { - if (bits_available_ > 0) { - bit_writer_->Flush(); - values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written()); - bit_writer_->Clear(); - bits_available_ = static_cast(bits_buffer_->size()) * 8; - } - - std::shared_ptr buffer = values_sink_->GetBuffer(); - values_sink_.reset(new InMemoryOutputStream(this->pool_)); - return buffer; - } - -#define 
PLAINDECODER_BOOLEAN_PUT(input_type, function_attributes) \ - void Put(input_type src, int num_values) function_attributes { \ - int bit_offset = 0; \ - if (bits_available_ > 0) { \ - int bits_to_write = std::min(bits_available_, num_values); \ - for (int i = 0; i < bits_to_write; i++) { \ - bit_writer_->PutValue(src[i], 1); \ - } \ - bits_available_ -= bits_to_write; \ - bit_offset = bits_to_write; \ - \ - if (bits_available_ == 0) { \ - bit_writer_->Flush(); \ - values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written()); \ - bit_writer_->Clear(); \ - } \ - } \ - \ - int bits_remaining = num_values - bit_offset; \ - while (bit_offset < num_values) { \ - bits_available_ = static_cast(bits_buffer_->size()) * 8; \ - \ - int bits_to_write = std::min(bits_available_, bits_remaining); \ - for (int i = bit_offset; i < bit_offset + bits_to_write; i++) { \ - bit_writer_->PutValue(src[i], 1); \ - } \ - bit_offset += bits_to_write; \ - bits_available_ -= bits_to_write; \ - bits_remaining -= bits_to_write; \ - \ - if (bits_available_ == 0) { \ - bit_writer_->Flush(); \ - values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written()); \ - bit_writer_->Clear(); \ - } \ - } \ - } - - PLAINDECODER_BOOLEAN_PUT(const bool*, override) - PLAINDECODER_BOOLEAN_PUT(const std::vector&, ) - - protected: - int bits_available_; - std::unique_ptr bit_writer_; - std::shared_ptr bits_buffer_; - std::unique_ptr values_sink_; -}; - -template -inline std::shared_ptr PlainEncoder::FlushValues() { - std::shared_ptr buffer = values_sink_->GetBuffer(); - values_sink_.reset(new InMemoryOutputStream(this->pool_)); - return buffer; -} - -template -inline void PlainEncoder::Put(const T* buffer, int num_values) { - values_sink_->Write(reinterpret_cast(buffer), num_values * sizeof(T)); -} - -template <> -inline void PlainEncoder::Put(const ByteArray* src, int num_values) { - for (int i = 0; i < num_values; ++i) { - // Write the result to the output stream - values_sink_->Write(reinterpret_cast(&src[i].len), sizeof(uint32_t)); - if (src[i].len > 0) { - DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL"; - } - values_sink_->Write(reinterpret_cast(src[i].ptr), src[i].len); - } -} - -template <> -inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_values) { - for (int i = 0; i < num_values; ++i) { - // Write the result to the output stream - if (descr_->type_length() > 0) { - DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL"; - } - values_sink_->Write(reinterpret_cast(src[i].ptr), - descr_->type_length()); - } -} - -// ---------------------------------------------------------------------- -// Dictionary encoding and decoding - -template -class DictionaryDecoder : public Decoder { - public: - typedef typename Type::c_type T; - - // Initializes the dictionary with values from 'dictionary'. The data in - // dictionary is not guaranteed to persist in memory after this call so the - // dictionary decoder needs to copy the data out if necessary. 
-  explicit DictionaryDecoder(const ColumnDescriptor* descr,
-                             ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
-      : Decoder<Type>(descr, Encoding::RLE_DICTIONARY),
-        dictionary_(0, pool),
-        byte_array_data_(AllocateBuffer(pool, 0)) {}
-
-  // Perform type-specific initialization
-  void SetDict(Decoder<Type>* dictionary);
-
-  void SetData(int num_values, const uint8_t* data, int len) override {
-    num_values_ = num_values;
-    if (len == 0) return;
-    uint8_t bit_width = *data;
-    ++data;
-    --len;
-    idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width);
-  }
-
-  int Decode(T* buffer, int max_values) override {
-    max_values = std::min(max_values, num_values_);
-    int decoded_values =
-        idx_decoder_.GetBatchWithDict(dictionary_.data(), buffer, max_values);
-    if (decoded_values != max_values) {
-      ParquetException::EofException();
-    }
-    num_values_ -= max_values;
-    return max_values;
-  }
-
-  int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
-                   int64_t valid_bits_offset) override {
-    int decoded_values =
-        idx_decoder_.GetBatchWithDictSpaced(dictionary_.data(), buffer, num_values,
-                                            null_count, valid_bits, valid_bits_offset);
-    if (decoded_values != num_values) {
-      ParquetException::EofException();
-    }
-    return decoded_values;
-  }
-
- private:
-  using Decoder<Type>::num_values_;
-
-  // Only one is set.
-  Vector<T> dictionary_;
-
-  // Data that contains the byte array data (byte_array_dictionary_ just has the
-  // pointers).
-  std::shared_ptr<ResizableBuffer> byte_array_data_;
-
-  ::arrow::util::RleDecoder idx_decoder_;
-};
-
-template <typename Type>
-inline void DictionaryDecoder<Type>::SetDict(Decoder<Type>* dictionary) {
-  int num_dictionary_values = dictionary->values_left();
-  dictionary_.Resize(num_dictionary_values);
-  dictionary->Decode(&dictionary_[0], num_dictionary_values);
-}
-
-template <>
-inline void DictionaryDecoder<BooleanType>::SetDict(Decoder<BooleanType>* dictionary) {
-  ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
-}
-
-template <>
-inline void DictionaryDecoder<ByteArrayType>::SetDict(
-    Decoder<ByteArrayType>* dictionary) {
-  int num_dictionary_values = dictionary->values_left();
-  dictionary_.Resize(num_dictionary_values);
-  dictionary->Decode(&dictionary_[0], num_dictionary_values);
-
-  int total_size = 0;
-  for (int i = 0; i < num_dictionary_values; ++i) {
-    total_size += dictionary_[i].len;
-  }
-  if (total_size > 0) {
-    PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false));
-  }
-
-  int offset = 0;
-  uint8_t* bytes_data = byte_array_data_->mutable_data();
-  for (int i = 0; i < num_dictionary_values; ++i) {
-    memcpy(bytes_data + offset, dictionary_[i].ptr, dictionary_[i].len);
-    dictionary_[i].ptr = bytes_data + offset;
-    offset += dictionary_[i].len;
-  }
-}
-
-template <>
-inline void DictionaryDecoder<FLBAType>::SetDict(Decoder<FLBAType>* dictionary) {
-  int num_dictionary_values = dictionary->values_left();
-  dictionary_.Resize(num_dictionary_values);
-  dictionary->Decode(&dictionary_[0], num_dictionary_values);
-
-  int fixed_len = descr_->type_length();
-  int total_size = num_dictionary_values * fixed_len;
-
-  PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false));
-  uint8_t* bytes_data = byte_array_data_->mutable_data();
-  for (int32_t i = 0, offset = 0; i < num_dictionary_values; ++i, offset += fixed_len) {
-    memcpy(bytes_data + offset, dictionary_[i].ptr, fixed_len);
-    dictionary_[i].ptr = bytes_data + offset;
-  }
-}
-
-// ----------------------------------------------------------------------
-// Dictionary encoder
-
-template <typename DType>
-struct DictEncoderTraits {
-  using c_type = typename DType::c_type;
-  using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
-};
-
-template <>
-struct DictEncoderTraits<ByteArrayType> {
-  using MemoTableType = ::arrow::internal::BinaryMemoTable;
-};
-
-template <>
-struct DictEncoderTraits<FLBAType> {
-  using MemoTableType = ::arrow::internal::BinaryMemoTable;
-};
-
-// Initially 1024 elements
-static constexpr int32_t INITIAL_HASH_TABLE_SIZE = 1 << 10;
-
-/// See the dictionary encoding section of https://github.com/Parquet/parquet-format.
-/// The encoding supports streaming encoding. Values are encoded as they are added while
-/// the dictionary is being constructed. At any time, the buffered values can be
-/// written out with the current dictionary size. More values can then be added to
-/// the encoder, including new dictionary entries.
-template <typename DType>
-class DictEncoder : public Encoder<DType> {
-  using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
-
- public:
-  typedef typename DType::c_type T;
-
-  explicit DictEncoder(const ColumnDescriptor* desc,
-                       ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool())
-      : Encoder<DType>(desc, Encoding::PLAIN_DICTIONARY, allocator),
-        allocator_(allocator),
-        dict_encoded_size_(0),
-        type_length_(desc->type_length()),
-        memo_table_(INITIAL_HASH_TABLE_SIZE) {}
-
-  ~DictEncoder() override { DCHECK(buffered_indices_.empty()); }
-
-  void set_type_length(int type_length) { type_length_ = type_length; }
-
-  /// Returns a conservative estimate of the number of bytes needed to encode the
-  /// buffered indices. Used to size the buffer passed to WriteIndices().
-  int64_t EstimatedDataEncodedSize() override {
-    // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to
-    // reserve an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be
-    // used but not reserving them would cause the encoder to fail.
-    return 1 +
-           ::arrow::util::RleEncoder::MaxBufferSize(
-               bit_width(), static_cast<int>(buffered_indices_.size())) +
-           ::arrow::util::RleEncoder::MinBufferSize(bit_width());
-  }
-
-  /// The minimum bit width required to encode the currently buffered indices.
-  int bit_width() const {
-    if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
-    if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
-    return BitUtil::Log2(num_entries());
-  }
-
-  /// Writes out any buffered indices to buffer preceded by the bit width of this data.
-  /// Returns the number of bytes written.
-  /// If the supplied buffer is not big enough, returns -1.
-  /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
-  /// to size buffer.
-  int WriteIndices(uint8_t* buffer, int buffer_len);
-
-  int dict_encoded_size() { return dict_encoded_size_; }
-
-  /// Encode value. Note that this does not actually write any data, just
-  /// buffers the value's index to be written later.
-  inline void Put(const T& value);
-  void Put(const T* values, int num_values) override;
-
-  std::shared_ptr<Buffer> FlushValues() override {
-    std::shared_ptr<ResizableBuffer> buffer =
-        AllocateBuffer(this->allocator_, EstimatedDataEncodedSize());
-    int result_size = WriteIndices(buffer->mutable_data(),
-                                   static_cast<int>(EstimatedDataEncodedSize()));
-    PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
-    return buffer;
-  }
-
-  void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
-                 int64_t valid_bits_offset) override {
-    ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
-                                                      num_values);
-    for (int32_t i = 0; i < num_values; i++) {
-      if (valid_bits_reader.IsSet()) {
-        Put(src[i]);
-      }
-      valid_bits_reader.Next();
-    }
-  }
-
-  /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
-  /// dict_encoded_size() bytes.
-  void WriteDict(uint8_t* buffer);
-
-  /// The number of entries in the dictionary.
-  int num_entries() const { return memo_table_.size(); }
-
- private:
-  /// Clears all the indices (but leaves the dictionary).
-  void ClearIndices() { buffered_indices_.clear(); }
-
-  ::arrow::MemoryPool* allocator_;
-
-  /// Indices that have not yet been written out by WriteIndices().
-  std::vector<int> buffered_indices_;
-
-  /// The number of bytes needed to encode the dictionary.
-  int dict_encoded_size_;
-
-  /// Size of each encoded dictionary value. -1 for variable-length types.
-  int type_length_;
-
-  MemoTableType memo_table_;
-};
-
-template <typename DType>
-void DictEncoder<DType>::Put(const T* src, int num_values) {
-  for (int32_t i = 0; i < num_values; i++) {
-    Put(src[i]);
-  }
-}
-
-template <typename DType>
-inline void DictEncoder<DType>::Put(const T& v) {
-  // Put() implementation for primitive types
-  auto on_found = [](int32_t memo_index) {};
-  auto on_not_found = [this](int32_t memo_index) {
-    dict_encoded_size_ += static_cast<int>(sizeof(T));
-  };
-
-  auto memo_index = memo_table_.GetOrInsert(v, on_found, on_not_found);
-  buffered_indices_.push_back(memo_index);
-}
-
-template <>
-inline void DictEncoder<ByteArrayType>::Put(const ByteArray& v) {
-  static const uint8_t empty[] = {0};
-
-  auto on_found = [](int32_t memo_index) {};
-  auto on_not_found = [&](int32_t memo_index) {
-    dict_encoded_size_ += static_cast<int>(v.len + sizeof(uint32_t));
-  };
-
-  DCHECK(v.ptr != nullptr || v.len == 0);
-  const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
-  auto memo_index =
-      memo_table_.GetOrInsert(ptr, static_cast<int32_t>(v.len), on_found, on_not_found);
-  buffered_indices_.push_back(memo_index);
-}
-
-template <>
-inline void DictEncoder<FLBAType>::Put(const FixedLenByteArray& v) {
-  static const uint8_t empty[] = {0};
-
-  auto on_found = [](int32_t memo_index) {};
-  auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
-
-  DCHECK(v.ptr != nullptr || type_length_ == 0);
-  const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
-  auto memo_index = memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found);
-  buffered_indices_.push_back(memo_index);
-}
-
-template <typename DType>
-inline void DictEncoder<DType>::WriteDict(uint8_t* buffer) {
-  // For primitive types, only a memcpy
-  DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
-  memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
-}
-
-// ByteArray and FLBA already have the dictionary encoded in their data heaps
-template <>
-inline void DictEncoder<ByteArrayType>::WriteDict(uint8_t* buffer) {
-  memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
-    uint32_t len = static_cast<uint32_t>(v.length());
-    memcpy(buffer, &len, sizeof(uint32_t));
-    buffer += sizeof(uint32_t);
-    memcpy(buffer, v.data(), v.length());
-    buffer += v.length();
-  });
-}
-
-template <>
-inline void DictEncoder<FLBAType>::WriteDict(uint8_t* buffer) {
-  memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
-    DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
-    memcpy(buffer, v.data(), type_length_);
-    buffer += type_length_;
-  });
-}
-
-template <typename DType>
-inline int DictEncoder<DType>::WriteIndices(uint8_t* buffer, int buffer_len) {
-  // Write bit width in first byte
-  *buffer = static_cast<uint8_t>(bit_width());
-  ++buffer;
-  --buffer_len;
-
-  ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
-  for (int index : buffered_indices_) {
-    if (!encoder.Put(index)) return -1;
-  }
-  encoder.Flush();
-
-  ClearIndices();
-  return 1 + encoder.len();
-}
-
-// ----------------------------------------------------------------------
-// DeltaBitPackDecoder
-
-template <typename DType>
-class DeltaBitPackDecoder : public Decoder<DType> {
- public:
-  typedef typename DType::c_type T;
-
-  explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
-                               ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
-      : Decoder<DType>(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
-    if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
-      throw ParquetException("Delta bit pack encoding should only be for integer data.");
-    }
-  }
-
-  virtual void SetData(int num_values, const uint8_t* data, int len) {
-    num_values_ = num_values;
-    decoder_ = BitUtil::BitReader(data, len);
-    values_current_block_ = 0;
-    values_current_mini_block_ = 0;
-  }
-
-  virtual int Decode(T* buffer, int max_values) {
-    return GetInternal(buffer, max_values);
-  }
-
- private:
-  using Decoder<DType>::num_values_;
-
-  void InitBlock() {
-    int32_t block_size;
-    if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&values_current_block_)) {
-      ParquetException::EofException();
-    }
-    if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
-
-    delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
-    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
-
-    if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
-    for (int i = 0; i < num_mini_blocks_; ++i) {
-      if (!decoder_.GetAligned(1, bit_width_data + i)) {
-        ParquetException::EofException();
-      }
-    }
-    values_per_mini_block_ = block_size / num_mini_blocks_;
-    mini_block_idx_ = 0;
-    delta_bit_width_ = bit_width_data[0];
-    values_current_mini_block_ = values_per_mini_block_;
-  }
-
-  template <typename T>
-  int GetInternal(T* buffer, int max_values) {
-    max_values = std::min(max_values, num_values_);
-    const uint8_t* bit_width_data = delta_bit_widths_->data();
-    for (int i = 0; i < max_values; ++i) {
-      if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
-        ++mini_block_idx_;
-        if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
-          delta_bit_width_ = bit_width_data[mini_block_idx_];
-          values_current_mini_block_ = values_per_mini_block_;
-        } else {
-          InitBlock();
-          buffer[i] = last_value_;
-          continue;
-        }
-      }
-
-      // TODO: the key to this algorithm is to decode the entire miniblock at once.
-      int64_t delta;
-      if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
-      delta += min_delta_;
-      last_value_ += static_cast<int32_t>(delta);
-      buffer[i] = last_value_;
-      --values_current_mini_block_;
-    }
-    num_values_ -= max_values;
-    return max_values;
-  }
-
-  ::arrow::MemoryPool* pool_;
-  BitUtil::BitReader decoder_;
-  int32_t values_current_block_;
-  int32_t num_mini_blocks_;
-  uint64_t values_per_mini_block_;
-  uint64_t values_current_mini_block_;
-
-  int32_t min_delta_;
-  size_t mini_block_idx_;
-  std::shared_ptr<ResizableBuffer> delta_bit_widths_;
-  int delta_bit_width_;
-
-  int32_t last_value_;
-};
-
-// ----------------------------------------------------------------------
-// DELTA_LENGTH_BYTE_ARRAY
-
-class DeltaLengthByteArrayDecoder : public Decoder<ByteArrayType> {
- public:
-  explicit DeltaLengthByteArrayDecoder(
-      const ColumnDescriptor* descr,
-      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
-      : Decoder<ByteArrayType>(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
-        len_decoder_(nullptr, pool) {}
-
-  virtual void SetData(int num_values, const uint8_t* data, int len) {
-    num_values_ = num_values;
-    if (len == 0) return;
-    int total_lengths_len = *reinterpret_cast<const int*>(data);
-    data += 4;
-    len_decoder_.SetData(num_values, data, total_lengths_len);
-    data_ = data + total_lengths_len;
-    len_ = len - 4 - total_lengths_len;
-  }
-
-  virtual int Decode(ByteArray* buffer, int max_values) {
-    max_values = std::min(max_values, num_values_);
-    std::vector<int> lengths(max_values);
-    len_decoder_.Decode(lengths.data(), max_values);
-    for (int i = 0; i < max_values; ++i) {
-      buffer[i].len = lengths[i];
-      buffer[i].ptr = data_;
-      data_ += lengths[i];
-      len_ -= lengths[i];
-    }
-    num_values_ -= max_values;
-    return max_values;
-  }
-
- private:
-  using Decoder<ByteArrayType>::num_values_;
-  DeltaBitPackDecoder<Int32Type> len_decoder_;
-  const uint8_t* data_;
-  int len_;
-};
-
-// ----------------------------------------------------------------------
-// DELTA_BYTE_ARRAY
-
-class DeltaByteArrayDecoder : public Decoder<ByteArrayType> {
- public:
-  explicit DeltaByteArrayDecoder(
-      const ColumnDescriptor* descr,
-      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
-      : Decoder<ByteArrayType>(descr, Encoding::DELTA_BYTE_ARRAY),
-        prefix_len_decoder_(nullptr, pool),
-        suffix_decoder_(nullptr, pool),
-        last_value_(0, nullptr) {}
-
-  virtual void SetData(int num_values, const uint8_t* data, int len) {
-    num_values_ = num_values;
-    if (len == 0) return;
-    int prefix_len_length = *reinterpret_cast<const int*>(data);
-    data += 4;
-    len -= 4;
-    prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
-    data += prefix_len_length;
-    len -= prefix_len_length;
-    suffix_decoder_.SetData(num_values, data, len);
-  }
-
-  // TODO: this doesn't work and requires memory management. We need to allocate
-  // new strings to store the results.
-  virtual int Decode(ByteArray* buffer, int max_values) {
-    max_values = std::min(max_values, num_values_);
-    for (int i = 0; i < max_values; ++i) {
-      int prefix_len = 0;
-      prefix_len_decoder_.Decode(&prefix_len, 1);
-      ByteArray suffix = {0, nullptr};
-      suffix_decoder_.Decode(&suffix, 1);
-      buffer[i].len = prefix_len + suffix.len;
-
-      uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
-      memcpy(result, last_value_.ptr, prefix_len);
-      memcpy(result + prefix_len, suffix.ptr, suffix.len);
-
-      buffer[i].ptr = result;
-      last_value_ = buffer[i];
-    }
-    num_values_ -= max_values;
-    return max_values;
-  }
-
- private:
-  using Decoder<ByteArrayType>::num_values_;
-
-  DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
-  DeltaLengthByteArrayDecoder suffix_decoder_;
-  ByteArray last_value_;
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_ENCODING_INTERNAL_H
diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc
index 90ceb7828b139..28d98126ec84a 100644
--- a/cpp/src/parquet/encoding-test.cc
+++ b/cpp/src/parquet/encoding-test.cc
@@ -24,7 +24,7 @@
 
 #include "arrow/util/bit-util.h"
 
-#include "parquet/encoding-internal.h"
+#include "parquet/encoding.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
 #include "parquet/util/memory.h"
@@ -43,29 +43,31 @@ namespace test {
 
 TEST(VectorBooleanTest, TestEncodeDecode) {
   // PARQUET-454
   int nvalues = 10000;
-  int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues));
+  int nbytes = static_cast<int>(::arrow::BitUtil::BytesForBits(nvalues));
 
   // seed the prng so failure is deterministic
   vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);
 
-  PlainEncoder<BooleanType> encoder(nullptr);
-  PlainDecoder<BooleanType> decoder(nullptr);
+  std::unique_ptr<BooleanEncoder> encoder =
+      MakeTypedEncoder<BooleanType>(Encoding::PLAIN);
+  encoder->Put(draws, nvalues);
 
-  encoder.Put(draws, nvalues);
+  std::unique_ptr<BooleanDecoder> decoder =
+      MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
 
-  std::shared_ptr<Buffer> encode_buffer = encoder.FlushValues();
+  std::shared_ptr<Buffer> encode_buffer = encoder->FlushValues();
   ASSERT_EQ(nbytes, encode_buffer->size());
 
   vector<uint8_t> decode_buffer(nbytes);
   const uint8_t* decode_data = &decode_buffer[0];
 
-  decoder.SetData(nvalues, encode_buffer->data(),
-                  static_cast<int>(encode_buffer->size()));
-  int values_decoded = decoder.Decode(&decode_buffer[0], nvalues);
+  decoder->SetData(nvalues, encode_buffer->data(),
+                   static_cast<int>(encode_buffer->size()));
+  int values_decoded = decoder->Decode(&decode_buffer[0], nvalues);
   ASSERT_EQ(nvalues, values_decoded);
 
   for (int i = 0; i < nvalues; ++i) {
-    ASSERT_EQ(draws[i], BitUtil::GetBit(decode_data, i)) << i;
+    ASSERT_EQ(draws[i], ::arrow::BitUtil::GetBit(decode_data, i)) << i;
   }
 }
 
@@ -214,14 +216,14 @@ class TestPlainEncoding : public TestEncodingBase<Type> {
   static constexpr int TYPE = Type::type_num;
 
   virtual void CheckRoundtrip() {
-    PlainEncoder<Type> encoder(descr_.get());
-    PlainDecoder<Type> decoder(descr_.get());
-    encoder.Put(draws_, num_values_);
-    encode_buffer_ = encoder.FlushValues();
-
-    decoder.SetData(num_values_, encode_buffer_->data(),
-                    static_cast<int>(encode_buffer_->size()));
-    int values_decoded = decoder.Decode(decode_buf_, num_values_);
+    auto encoder = MakeTypedEncoder<Type>(Encoding::PLAIN, false, descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr_.get());
+    encoder->Put(draws_, num_values_);
+    encode_buffer_ = encoder->FlushValues();
+
+    decoder->SetData(num_values_, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    int values_decoded = decoder->Decode(decode_buf_, num_values_);
     ASSERT_EQ(num_values_, values_decoded);
     ASSERT_NO_FATAL_FAILURE(VerifyResults(decode_buf_, draws_, num_values_));
   }
@@ -250,29 +252,38 @@ class TestDictionaryEncoding : public TestEncodingBase<Type> {
   static constexpr int TYPE = Type::type_num;
 
   void CheckRoundtrip() {
-    std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255);
-    DictEncoder<Type> encoder(descr_.get());
+    std::vector<uint8_t> valid_bits(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255);
 
-    ASSERT_NO_THROW(encoder.Put(draws_, num_values_));
-    dict_buffer_ = AllocateBuffer(default_memory_pool(), encoder.dict_encoded_size());
-    encoder.WriteDict(dict_buffer_->mutable_data());
-    std::shared_ptr<Buffer> indices = encoder.FlushValues();
+    auto base_encoder = MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get());
+    auto encoder =
+        dynamic_cast<typename EncodingTraits<Type>::Encoder*>(base_encoder.get());
+    auto dict_traits = dynamic_cast<DictEncoder<Type>*>(base_encoder.get());
+
+    ASSERT_NO_THROW(encoder->Put(draws_, num_values_));
+    dict_buffer_ =
+        AllocateBuffer(default_memory_pool(), dict_traits->dict_encoded_size());
+    dict_traits->WriteDict(dict_buffer_->mutable_data());
+    std::shared_ptr<Buffer> indices = encoder->FlushValues();
+
+    auto base_spaced_encoder =
+        MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get());
+    auto spaced_encoder =
+        dynamic_cast<typename EncodingTraits<Type>::Encoder*>(base_spaced_encoder.get());
 
-    DictEncoder<Type> spaced_encoder(descr_.get());
     // PutSpaced should lead to the same results
-    ASSERT_NO_THROW(spaced_encoder.PutSpaced(draws_, num_values_, valid_bits.data(), 0));
-    std::shared_ptr<Buffer> indices_from_spaced = spaced_encoder.FlushValues();
+    ASSERT_NO_THROW(spaced_encoder->PutSpaced(draws_, num_values_, valid_bits.data(), 0));
+    std::shared_ptr<Buffer> indices_from_spaced = spaced_encoder->FlushValues();
     ASSERT_TRUE(indices_from_spaced->Equals(*indices));
 
-    PlainDecoder<Type> dict_decoder(descr_.get());
-    dict_decoder.SetData(encoder.num_entries(), dict_buffer_->data(),
-                         static_cast<int>(dict_buffer_->size()));
+    auto dict_decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr_.get());
+    dict_decoder->SetData(dict_traits->num_entries(), dict_buffer_->data(),
+                          static_cast<int>(dict_buffer_->size()));
 
-    DictionaryDecoder<Type> decoder(descr_.get());
-    decoder.SetDict(&dict_decoder);
+    auto decoder = MakeDictDecoder<Type>(descr_.get());
+    decoder->SetDict(dict_decoder.get());
 
-    decoder.SetData(num_values_, indices->data(), static_cast<int>(indices->size()));
-    int values_decoded = decoder.Decode(decode_buf_, num_values_);
+    decoder->SetData(num_values_, indices->data(), static_cast<int>(indices->size()));
+    int values_decoded = decoder->Decode(decode_buf_, num_values_);
     ASSERT_EQ(num_values_, values_decoded);
 
     // TODO(wesm): The DictionaryDecoder must stay alive because the decoded
@@ -281,9 +292,9 @@ class TestDictionaryEncoding : public TestEncodingBase<Type> {
     ASSERT_NO_FATAL_FAILURE(VerifyResults(decode_buf_, draws_, num_values_));
 
     // Also test spaced decoding
-    decoder.SetData(num_values_, indices->data(), static_cast<int>(indices->size()));
+    decoder->SetData(num_values_, indices->data(), static_cast<int>(indices->size()));
     values_decoded =
-        decoder.DecodeSpaced(decode_buf_, num_values_, 0, valid_bits.data(), 0);
+        decoder->DecodeSpaced(decode_buf_, num_values_, 0, valid_bits.data(), 0);
     ASSERT_EQ(num_values_, values_decoded);
     ASSERT_NO_FATAL_FAILURE(VerifyResults(decode_buf_, draws_, num_values_));
   }
@@ -300,10 +311,7 @@ TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) {
 }
 
 TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) {
-  PlainDecoder<BooleanType> dict_decoder(nullptr);
-  DictionaryDecoder<BooleanType> decoder(nullptr);
-
-  ASSERT_THROW(decoder.SetDict(&dict_decoder), ParquetException);
+  ASSERT_THROW(MakeDictDecoder<BooleanType>(nullptr), ParquetException);
 }
 
 }  // namespace test
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
new file mode 100644
index 0000000000000..3fd3ceca4c5e2
--- /dev/null
+++ b/cpp/src/parquet/encoding.cc
@@ -0,0 +1,1279 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encoding.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <vector>
+
+#include "arrow/builder.h"
+#include "arrow/status.h"
+#include "arrow/util/bit-stream-utils.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/rle-encoding.h"
+#include "arrow/util/string_view.h"
+
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+#include "parquet/util/memory.h"
+
+namespace parquet {
+
+namespace BitUtil = ::arrow::BitUtil;
+
+class EncoderImpl : virtual public Encoder {
+ public:
+  EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding,
+              ::arrow::MemoryPool* pool)
+      : descr_(descr),
+        encoding_(encoding),
+        pool_(pool),
+        type_length_(descr ? descr->type_length() : -1) {}
+
+  Encoding::type encoding() const override { return encoding_; }
+
+  ::arrow::MemoryPool* memory_pool() const override { return pool_; }
+
+ protected:
+  // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+  const ColumnDescriptor* descr_;
+  const Encoding::type encoding_;
+  ::arrow::MemoryPool* pool_;
+
+  /// Type length from descr
+  int type_length_;
+};
+
+// ----------------------------------------------------------------------
+// Plain encoder implementation
+
+template <typename DType>
+class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+  using T = typename DType::c_type;
+
+  explicit PlainEncoder(const ColumnDescriptor* descr,
+                        ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  int64_t EstimatedDataEncodedSize() override;
+  std::shared_ptr<Buffer> FlushValues() override;
+
+  void Put(const T* buffer, int num_values) override;
+
+ protected:
+  std::unique_ptr<InMemoryOutputStream> values_sink_;
+};
+
+template <typename DType>
+PlainEncoder<DType>::PlainEncoder(const ColumnDescriptor* descr,
+                                  ::arrow::MemoryPool* pool)
+    : EncoderImpl(descr, Encoding::PLAIN, pool) {
+  values_sink_.reset(new InMemoryOutputStream(pool));
+}
+template <typename DType>
+int64_t PlainEncoder<DType>::EstimatedDataEncodedSize() {
+  return values_sink_->Tell();
+}
+
+template <typename DType>
+std::shared_ptr<Buffer> PlainEncoder<DType>::FlushValues() {
+  std::shared_ptr<Buffer> buffer = values_sink_->GetBuffer();
+  values_sink_.reset(new InMemoryOutputStream(this->pool_));
+  return buffer;
+}
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
+  values_sink_->Write(reinterpret_cast<const uint8_t*>(buffer), num_values * sizeof(T));
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ByteArray* src, int num_values) {
+  for (int i = 0; i < num_values; ++i) {
+    // Write the result to the output stream
+    values_sink_->Write(reinterpret_cast<const uint8_t*>(&src[i].len), sizeof(uint32_t));
+    if (src[i].len > 0) {
+      DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL";
+    }
+    values_sink_->Write(reinterpret_cast<const uint8_t*>(src[i].ptr), src[i].len);
+  }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const FixedLenByteArray* src, int num_values) {
+  for (int i = 0; i < num_values; ++i) {
+    // Write the result to the output stream
+    if (descr_->type_length() > 0) {
+      DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL";
+    }
+    values_sink_->Write(reinterpret_cast<const uint8_t*>(src[i].ptr),
+                        descr_->type_length());
+  }
+}
+
+class PlainByteArrayEncoder : public PlainEncoder<ByteArrayType>,
+                              virtual public ByteArrayEncoder {
+ public:
+  using BASE = PlainEncoder<ByteArrayType>;
+  using BASE::PlainEncoder;
+};
+
+class PlainFLBAEncoder : public PlainEncoder<FLBAType>, virtual public FLBAEncoder {
+ public:
+  using BASE = PlainEncoder<FLBAType>;
+  using BASE::PlainEncoder;
+};
+
+class PlainBooleanEncoder : public EncoderImpl,
+                            virtual public TypedEncoder<BooleanType>,
+                            virtual public BooleanEncoder {
+ public:
+  explicit PlainBooleanEncoder(
+      const ColumnDescriptor* descr,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  int64_t EstimatedDataEncodedSize() override;
+  std::shared_ptr<Buffer> FlushValues() override;
+
+  void Put(const bool* src, int num_values) override;
+  void Put(const std::vector<bool>& src, int num_values) override;
+
+ private:
+  int bits_available_;
+  std::unique_ptr<::arrow::BitUtil::BitWriter> bit_writer_;
+  std::shared_ptr<ResizableBuffer> bits_buffer_;
+  std::unique_ptr<InMemoryOutputStream> values_sink_;
+
+  template <typename SequenceType>
+  void PutImpl(const SequenceType& src, int num_values);
+};
+
+template <typename SequenceType>
+void PlainBooleanEncoder::PutImpl(const SequenceType& src, int num_values) {
+  int bit_offset = 0;
+  if (bits_available_ > 0) {
+    int bits_to_write = std::min(bits_available_, num_values);
+    for (int i = 0; i < bits_to_write; i++) {
+      bit_writer_->PutValue(src[i], 1);
+    }
+    bits_available_ -= bits_to_write;
+    bit_offset = bits_to_write;
+
+    if (bits_available_ == 0) {
+      bit_writer_->Flush();
+      values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written());
+      bit_writer_->Clear();
+    }
+  }
+
+  int bits_remaining = num_values - bit_offset;
+  while (bit_offset < num_values) {
+    bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+
+    int bits_to_write = std::min(bits_available_, bits_remaining);
+    for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {
+      bit_writer_->PutValue(src[i], 1);
+    }
+    bit_offset += bits_to_write;
+    bits_available_ -= bits_to_write;
+    bits_remaining -= bits_to_write;
+
+    if (bits_available_ == 0) {
+      bit_writer_->Flush();
+      values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written());
+      bit_writer_->Clear();
+    }
+  }
+}
+
+PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr,
+                                         ::arrow::MemoryPool* pool)
+    : EncoderImpl(descr, Encoding::PLAIN, pool),
+      bits_available_(kInMemoryDefaultCapacity * 8),
+      bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
+      values_sink_(new InMemoryOutputStream(pool)) {
+  bit_writer_.reset(new BitUtil::BitWriter(bits_buffer_->mutable_data(),
+                                           static_cast<int>(bits_buffer_->size())));
+}
+
+int64_t PlainBooleanEncoder::EstimatedDataEncodedSize() {
+  return values_sink_->Tell() + bit_writer_->bytes_written();
+}
+
+std::shared_ptr<Buffer> PlainBooleanEncoder::FlushValues() {
+  if (bits_available_ > 0) {
+    bit_writer_->Flush();
+    values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written());
+    bit_writer_->Clear();
+    bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+  }
+
+  std::shared_ptr<Buffer> buffer = values_sink_->GetBuffer();
+  values_sink_.reset(new InMemoryOutputStream(this->pool_));
+  return buffer;
+}
+
+void PlainBooleanEncoder::Put(const bool* src, int num_values) {
+  PutImpl(src, num_values);
+}
+
+void PlainBooleanEncoder::Put(const std::vector<bool>& src, int num_values) {
+  PutImpl(src, num_values);
+}
+
+// ----------------------------------------------------------------------
+// DictEncoder implementations
+
+template <typename DType>
+struct DictEncoderTraits {
+  using c_type = typename DType::c_type;
+  using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
+};
+
+template <>
+struct DictEncoderTraits<ByteArrayType> {
+  using MemoTableType = ::arrow::internal::BinaryMemoTable;
+};
+
+template <>
+struct DictEncoderTraits<FLBAType> {
+  using MemoTableType = ::arrow::internal::BinaryMemoTable;
+};
+
+/// See the dictionary encoding section of https://github.com/Parquet/parquet-format.
+/// The encoding supports streaming encoding. Values are encoded as they are added while
+/// the dictionary is being constructed. At any time, the buffered values can be
+/// written out with the current dictionary size. More values can then be added to
+/// the encoder, including new dictionary entries.
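The streaming contract described above is easiest to see from the caller's side. Below is a minimal sketch modeled on the updated `encoding-test.cc` round trip; the function name is hypothetical, and casting the untyped base to `TypedEncoder<Int32Type>` (where the test uses `EncodingTraits<Type>::Encoder`) is an assumption:

```cpp
#include <cstdint>
#include <vector>

#include "parquet/encoding.h"

// Dictionary-encode one batch of INT32 values through the new factory API.
// Put() only buffers indices; the dictionary page and the index page are
// materialized separately by WriteDict() and FlushValues().
void DictEncodeSketch(const parquet::ColumnDescriptor* descr) {
  auto base = parquet::MakeEncoder(parquet::Type::INT32, parquet::Encoding::PLAIN,
                                   /*use_dictionary=*/true, descr);
  auto typed = dynamic_cast<parquet::TypedEncoder<parquet::Int32Type>*>(base.get());
  auto dict = dynamic_cast<parquet::DictEncoder<parquet::Int32Type>*>(base.get());

  int32_t values[] = {3, 1, 3, 3, 2};
  typed->Put(values, 5);  // dictionary becomes [3, 1, 2], buffered indices {0,1,0,0,2}

  std::vector<uint8_t> dict_page(dict->dict_encoded_size());
  dict->WriteDict(dict_page.data());  // 12 bytes: the three distinct int32 entries

  auto index_page = typed->FlushValues();  // bit-width byte + RLE/bit-packed index runs
  // More Put() calls may follow; new distinct values keep growing the dictionary.
}
```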
+template <typename DType>
+class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
+  using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
+
+ public:
+  typedef typename DType::c_type T;
+
+  explicit DictEncoderImpl(
+      const ColumnDescriptor* desc,
+      ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool());
+
+  ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); }
+
+  int dict_encoded_size() override { return dict_encoded_size_; }
+
+  int WriteIndices(uint8_t* buffer, int buffer_len) override {
+    // Write bit width in first byte
+    *buffer = static_cast<uint8_t>(bit_width());
+    ++buffer;
+    --buffer_len;
+
+    ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
+    for (int index : buffered_indices_) {
+      if (!encoder.Put(index)) return -1;
+    }
+    encoder.Flush();
+
+    ClearIndices();
+    return 1 + encoder.len();
+  }
+
+  void set_type_length(int type_length) { this->type_length_ = type_length; }
+
+  /// Returns a conservative estimate of the number of bytes needed to encode the
+  /// buffered indices. Used to size the buffer passed to WriteIndices().
+  int64_t EstimatedDataEncodedSize() override;
+
+  /// The minimum bit width required to encode the currently buffered indices.
+  int bit_width() const override;
+
+  /// Encode value. Note that this does not actually write any data, just
+  /// buffers the value's index to be written later.
+  inline void Put(const T& value);
+  void Put(const T* values, int num_values) override;
+
+  std::shared_ptr<Buffer> FlushValues() override;
+
+  void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+                 int64_t valid_bits_offset) override;
+
+  /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+  /// dict_encoded_size() bytes.
+  void WriteDict(uint8_t* buffer) override;
+
+  /// The number of entries in the dictionary.
+  int num_entries() const override { return memo_table_.size(); }
+
+ private:
+  /// Clears all the indices (but leaves the dictionary).
+  void ClearIndices() { buffered_indices_.clear(); }
+
+  /// Indices that have not yet been written out by WriteIndices().
+  std::vector<int> buffered_indices_;
+
+  /// The number of bytes needed to encode the dictionary.
+  int dict_encoded_size_;
+
+  MemoTableType memo_table_;
+};
+
+// Initially 1024 elements
+static constexpr int32_t INITIAL_HASH_TABLE_SIZE = 1 << 10;
+
+template <typename DType>
+DictEncoderImpl<DType>::DictEncoderImpl(const ColumnDescriptor* desc,
+                                        ::arrow::MemoryPool* pool)
+    : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
+      dict_encoded_size_(0),
+      memo_table_(INITIAL_HASH_TABLE_SIZE) {}
+
+template <typename DType>
+int64_t DictEncoderImpl<DType>::EstimatedDataEncodedSize() {
+  // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to
+  // reserve an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be
+  // used but not reserving them would cause the encoder to fail.
+  return 1 +
+         ::arrow::util::RleEncoder::MaxBufferSize(
+             bit_width(), static_cast<int>(buffered_indices_.size())) +
+         ::arrow::util::RleEncoder::MinBufferSize(bit_width());
+}
+
+template <typename DType>
+int DictEncoderImpl<DType>::bit_width() const {
+  if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
+  if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
+  return BitUtil::Log2(num_entries());
+}
+
+template <typename DType>
+std::shared_ptr<Buffer> DictEncoderImpl<DType>::FlushValues() {
+  std::shared_ptr<ResizableBuffer> buffer =
+      AllocateBuffer(this->pool_, EstimatedDataEncodedSize());
+  int result_size =
+      WriteIndices(buffer->mutable_data(), static_cast<int>(EstimatedDataEncodedSize()));
+  PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
+  return buffer;
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::Put(const T* src, int num_values) {
+  for (int32_t i = 0; i < num_values; i++) {
+    Put(src[i]);
+  }
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::PutSpaced(const T* src, int num_values,
+                                       const uint8_t* valid_bits,
+                                       int64_t valid_bits_offset) {
+  ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
+                                                    num_values);
+  for (int32_t i = 0; i < num_values; i++) {
+    if (valid_bits_reader.IsSet()) {
+      Put(src[i]);
+    }
+    valid_bits_reader.Next();
+  }
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) {
+  // For primitive types, only a memcpy
+  DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
+  memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
+}
+
+// ByteArray and FLBA already have the dictionary encoded in their data heaps
+template <>
+void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) {
+  memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
+    uint32_t len = static_cast<uint32_t>(v.length());
+    memcpy(buffer, &len, sizeof(uint32_t));
+    buffer += sizeof(uint32_t);
+    memcpy(buffer, v.data(), v.length());
+    buffer += v.length();
+  });
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) {
+  memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
+    DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
+    memcpy(buffer, v.data(), type_length_);
+    buffer += type_length_;
+  });
+}
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::Put(const T& v) {
+  // Put() implementation for primitive types
+  auto on_found = [](int32_t memo_index) {};
+  auto on_not_found = [this](int32_t memo_index) {
+    dict_encoded_size_ += static_cast<int>(sizeof(T));
+  };
+
+  auto memo_index = memo_table_.GetOrInsert(v, on_found, on_not_found);
+  buffered_indices_.push_back(memo_index);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::Put(const ByteArray& v) {
+  static const uint8_t empty[] = {0};
+
+  auto on_found = [](int32_t memo_index) {};
+  auto on_not_found = [&](int32_t memo_index) {
+    dict_encoded_size_ += static_cast<int>(v.len + sizeof(uint32_t));
+  };
+
+  DCHECK(v.ptr != nullptr || v.len == 0);
+  const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
+  auto memo_index =
+      memo_table_.GetOrInsert(ptr, static_cast<int32_t>(v.len), on_found, on_not_found);
+  buffered_indices_.push_back(memo_index);
+}
+
+template <>
+inline void DictEncoderImpl<FLBAType>::Put(const FixedLenByteArray& v) {
+  static const uint8_t empty[] = {0};
+
+  auto on_found = [](int32_t memo_index) {};
+  auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
+
+  DCHECK(v.ptr != nullptr || type_length_ == 0);
+  const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
+  auto memo_index = memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found);
+  buffered_indices_.push_back(memo_index);
+}
+
+class DictByteArrayEncoder : public DictEncoderImpl<ByteArrayType>,
+                             virtual public ByteArrayEncoder {
+ public:
+  using BASE = DictEncoderImpl<ByteArrayType>;
+  using BASE::DictEncoderImpl;
+};
+
+class DictFLBAEncoder : public DictEncoderImpl<FLBAType>, virtual public FLBAEncoder {
+ public:
+  using BASE = DictEncoderImpl<FLBAType>;
+  using BASE::DictEncoderImpl;
+};
+
+// ----------------------------------------------------------------------
+// Encoder and decoder factory functions
+
+std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encoding,
+                                     bool use_dictionary, const ColumnDescriptor* descr,
+                                     ::arrow::MemoryPool* pool) {
+  if (use_dictionary) {
+    switch (type_num) {
+      case Type::INT32:
+        return std::unique_ptr<Encoder>(new DictEncoderImpl<Int32Type>(descr, pool));
+      case Type::INT64:
+        return std::unique_ptr<Encoder>(new DictEncoderImpl<Int64Type>(descr, pool));
+      case Type::INT96:
+        return std::unique_ptr<Encoder>(new DictEncoderImpl<Int96Type>(descr, pool));
+      case Type::FLOAT:
+        return std::unique_ptr<Encoder>(new DictEncoderImpl<FloatType>(descr, pool));
+      case Type::DOUBLE:
+        return std::unique_ptr<Encoder>(new DictEncoderImpl<DoubleType>(descr, pool));
+      case Type::BYTE_ARRAY:
+        return std::unique_ptr<Encoder>(new DictByteArrayEncoder(descr, pool));
+      case Type::FIXED_LEN_BYTE_ARRAY:
+        return std::unique_ptr<Encoder>(new DictFLBAEncoder(descr, pool));
+      default:
+        DCHECK(false) << "Encoder not implemented";
+        break;
+    }
+  } else if (encoding == Encoding::PLAIN) {
+    switch (type_num) {
+      case Type::BOOLEAN:
+        return std::unique_ptr<Encoder>(new PlainBooleanEncoder(descr, pool));
+      case Type::INT32:
+        return std::unique_ptr<Encoder>(new PlainEncoder<Int32Type>(descr, pool));
+      case Type::INT64:
+        return std::unique_ptr<Encoder>(new PlainEncoder<Int64Type>(descr, pool));
+      case Type::INT96:
+        return std::unique_ptr<Encoder>(new PlainEncoder<Int96Type>(descr, pool));
+      case Type::FLOAT:
+        return std::unique_ptr<Encoder>(new PlainEncoder<FloatType>(descr, pool));
+      case Type::DOUBLE:
+        return std::unique_ptr<Encoder>(new PlainEncoder<DoubleType>(descr, pool));
+      case Type::BYTE_ARRAY:
+        return std::unique_ptr<Encoder>(new PlainByteArrayEncoder(descr, pool));
+      case Type::FIXED_LEN_BYTE_ARRAY:
+        return std::unique_ptr<Encoder>(new PlainFLBAEncoder(descr, pool));
+      default:
+        DCHECK(false) << "Encoder not implemented";
+        break;
+    }
+  } else {
+    ParquetException::NYI("Selected encoding is not supported");
+  }
+  DCHECK(false) << "Should not be able to reach this code";
+  return nullptr;
+}
+
+class DecoderImpl : virtual public Decoder {
+ public:
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    data_ = data;
+    len_ = len;
+  }
+
+  int values_left() const override { return num_values_; }
+  Encoding::type encoding() const override { return encoding_; }
+
+ protected:
+  explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding)
+      : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {}
+
+  // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+  const ColumnDescriptor* descr_;
+
+  const Encoding::type encoding_;
+  int num_values_;
+  const uint8_t* data_;
+  int len_;
+  int type_length_;
+};
+
+template <typename DType>
+class PlainDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+  using T = typename DType::c_type;
+  explicit PlainDecoder(const ColumnDescriptor* descr);
+
+  int Decode(T* buffer, int max_values) override;
+};
+
+template <typename DType>
+PlainDecoder<DType>::PlainDecoder(const ColumnDescriptor* descr)
+    : DecoderImpl(descr, Encoding::PLAIN) {
+  if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+    type_length_ = descr_->type_length();
+  } else {
+    type_length_ = -1;
+  }
+}
+
+// Decode routine templated on C++ type rather than type enum
+template <typename T>
+inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
+                       int type_length, T* out) {
+  int bytes_to_decode = num_values * static_cast<int>(sizeof(T));
+  if (data_size < bytes_to_decode) {
+    ParquetException::EofException();
+  }
+  // If bytes_to_decode == 0, data could be null
+  if (bytes_to_decode > 0) {
+    memcpy(out, data, bytes_to_decode);
+  }
+  return bytes_to_decode;
+}
+
+// Template specialization for BYTE_ARRAY. The written values do not own their
+// own data.
+template <>
+inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int num_values,
+                                  int type_length, ByteArray* out) {
+  int bytes_decoded = 0;
+  int increment;
+  for (int i = 0; i < num_values; ++i) {
+    uint32_t len = out[i].len = *reinterpret_cast<const uint32_t*>(data);
+    increment = static_cast<int>(sizeof(uint32_t) + len);
+    if (data_size < increment) ParquetException::EofException();
+    out[i].ptr = data + sizeof(uint32_t);
+    data += increment;
+    data_size -= increment;
+    bytes_decoded += increment;
+  }
+  return bytes_decoded;
+}
+
+// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not
+// own their own data.
+template <>
+inline int DecodePlain<FixedLenByteArray>(const uint8_t* data, int64_t data_size,
+                                          int num_values, int type_length,
+                                          FixedLenByteArray* out) {
+  int bytes_to_decode = type_length * num_values;
+  if (data_size < bytes_to_decode) {
+    ParquetException::EofException();
+  }
+  for (int i = 0; i < num_values; ++i) {
+    out[i].ptr = data;
+    data += type_length;
+    data_size -= type_length;
+  }
+  return bytes_to_decode;
+}
+
+template <typename DType>
+int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
+  max_values = std::min(max_values, num_values_);
+  int bytes_consumed = DecodePlain<T>(data_, len_, max_values, type_length_, buffer);
+  data_ += bytes_consumed;
+  len_ -= bytes_consumed;
+  num_values_ -= max_values;
+  return max_values;
+}
+
+class PlainBooleanDecoder : public DecoderImpl,
+                            virtual public TypedDecoder<BooleanType>,
+                            virtual public BooleanDecoder {
+ public:
+  explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
+  void SetData(int num_values, const uint8_t* data, int len) override;
+
+  // Two flavors of bool decoding
+  int Decode(uint8_t* buffer, int max_values) override;
+  int Decode(bool* buffer, int max_values) override;
+
+ private:
+  std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_;
+};
+
+PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
+    : DecoderImpl(descr, Encoding::PLAIN) {}
+
+void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
+  num_values_ = num_values;
+  bit_reader_.reset(new BitUtil::BitReader(data, len));
+}
+
+int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
+  max_values = std::min(max_values, num_values_);
+  bool val;
+  ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
+  for (int i = 0; i < max_values; ++i) {
+    if (!bit_reader_->GetValue(1, &val)) {
+      ParquetException::EofException();
+    }
+    if (val) {
+      bit_writer.Set();
+    }
+    bit_writer.Next();
+  }
+  bit_writer.Finish();
+  num_values_ -= max_values;
+  return max_values;
+}
+
+int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
+  max_values = std::min(max_values, num_values_);
+  if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
+    ParquetException::EofException();
+  }
+  num_values_ -= max_values;
+  return max_values;
+}
+
+class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
+                              virtual public ByteArrayDecoder {
+ public:
+  using Base = PlainDecoder<ByteArrayType>;
+  using Base::DecodeSpaced;
+  using Base::PlainDecoder;
+
+  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                  int64_t valid_bits_offset,
+                  ::arrow::internal::ChunkedBinaryBuilder* out) override {
+    int result = 0;
+    PARQUET_THROW_NOT_OK(
+        DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result));
+    return result;
+  }
+
+  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                  int64_t valid_bits_offset,
+                  ::arrow::BinaryDictionaryBuilder* out) override {
+    int result = 0;
+    PARQUET_THROW_NOT_OK(
+        DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result));
+    return result;
+  }
+
+  int DecodeArrowNonNull(int num_values,
+                         ::arrow::internal::ChunkedBinaryBuilder* out) override {
+    int result = 0;
+    PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result));
+    return result;
+  }
+
+ private:
+  template <typename BuilderType>
+  ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                              int64_t valid_bits_offset, BuilderType* out,
+                              int* values_decoded) {
+    num_values = std::min(num_values, num_values_);
+
+    ARROW_RETURN_NOT_OK(out->Reserve(num_values));
+
+    ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+    int increment;
+    int i = 0;
+    const uint8_t* data = data_;
+    int64_t data_size = len_;
+    int bytes_decoded = 0;
+    while (i < num_values) {
+      if (bit_reader.IsSet()) {
+        uint32_t len = *reinterpret_cast<const uint32_t*>(data);
+        increment = static_cast<int>(sizeof(uint32_t) + len);
+        if (data_size < increment) {
+          ParquetException::EofException();
+        }
+        ARROW_RETURN_NOT_OK(out->Append(data + sizeof(uint32_t), len));
+        data += increment;
+        data_size -= increment;
+        bytes_decoded += increment;
+        ++i;
+      } else {
+        ARROW_RETURN_NOT_OK(out->AppendNull());
+      }
+      bit_reader.Next();
+    }
+
+    data_ += bytes_decoded;
+    len_ -= bytes_decoded;
+    num_values_ -= num_values;
+    *values_decoded = num_values;
+    return ::arrow::Status::OK();
+  }
+
+  ::arrow::Status DecodeArrowNonNull(int num_values,
+                                     ::arrow::internal::ChunkedBinaryBuilder* out,
+                                     int* values_decoded) {
+    num_values = std::min(num_values, num_values_);
+    ARROW_RETURN_NOT_OK(out->Reserve(num_values));
+    int i = 0;
+    const uint8_t* data = data_;
+    int64_t data_size = len_;
+    int bytes_decoded = 0;
+    while (i < num_values) {
+      uint32_t len = *reinterpret_cast<const uint32_t*>(data);
+      int increment = static_cast<int>(sizeof(uint32_t) + len);
+      if (data_size < increment) ParquetException::EofException();
+      ARROW_RETURN_NOT_OK(out->Append(data + sizeof(uint32_t), len));
+      data += increment;
+      data_size -= increment;
+      bytes_decoded += increment;
+      ++i;  // advance the value count; without this the loop would never terminate
+    }
+
+    data_ += bytes_decoded;
+    len_ -= bytes_decoded;
+    num_values_ -= num_values;
+    *values_decoded = num_values;
+    return ::arrow::Status::OK();
+  }
+};
+
+class PlainFLBADecoder : public PlainDecoder<FLBAType>, virtual public FLBADecoder {
+ public:
+  using Base = PlainDecoder<FLBAType>;
+  using Base::PlainDecoder;
+};
+
+// ----------------------------------------------------------------------
+// Dictionary encoding and decoding
+
+template <typename Type>
+class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
+ public:
+  typedef typename Type::c_type T;
+
+  // Initializes the dictionary with values from 'dictionary'. The data in
+  // dictionary is not guaranteed to persist in memory after this call so the
+  // dictionary decoder needs to copy the data out if necessary.
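As `SetData()` below shows, the index pages these decoders consume begin with a single bit-width byte followed by RLE/bit-packed runs of dictionary indices. A hand-rolled sketch of reading that layout with the same `::arrow::util::RleDecoder` the implementation uses (the function and buffer names are hypothetical):

```cpp
#include <cstdint>
#include <vector>

#include "arrow/util/rle-encoding.h"

// Decode a dictionary-encoded Parquet data page by hand: one bit-width byte,
// then RLE/bit-packed runs of dictionary indices.
std::vector<int32_t> DecodeIndicesSketch(const uint8_t* page, int page_len,
                                         int num_values) {
  uint8_t bit_width = page[0];  // width in bits of each dictionary index
  ::arrow::util::RleDecoder runs(page + 1, page_len - 1, bit_width);
  std::vector<int32_t> indices(num_values);
  int decoded = runs.GetBatch(indices.data(), num_values);
  indices.resize(decoded);  // a short read means the page was truncated
  return indices;
}
```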
+  explicit DictDecoderImpl(const ColumnDescriptor* descr,
+                           ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+      : DecoderImpl(descr, Encoding::RLE_DICTIONARY),
+        dictionary_(0, pool),
+        byte_array_data_(AllocateBuffer(pool, 0)) {}
+
+  // Perform type-specific initialization
+  void SetDict(TypedDecoder<Type>* dictionary) override;
+
+  void SetData(int num_values, const uint8_t* data, int len) override {
+    num_values_ = num_values;
+    if (len == 0) return;
+    uint8_t bit_width = *data;
+    ++data;
+    --len;
+    idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width);
+  }
+
+  int Decode(T* buffer, int max_values) override {
+    max_values = std::min(max_values, num_values_);
+    int decoded_values =
+        idx_decoder_.GetBatchWithDict(dictionary_.data(), buffer, max_values);
+    if (decoded_values != max_values) {
+      ParquetException::EofException();
+    }
+    num_values_ -= max_values;
+    return max_values;
+  }
+
+  int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
+                   int64_t valid_bits_offset) override {
+    int decoded_values =
+        idx_decoder_.GetBatchWithDictSpaced(dictionary_.data(), buffer, num_values,
+                                            null_count, valid_bits, valid_bits_offset);
+    if (decoded_values != num_values) {
+      ParquetException::EofException();
+    }
+    return decoded_values;
+  }
+
+ protected:
+  // Only one is set.
+  Vector<T> dictionary_;
+
+  // Data that contains the byte array data (byte_array_dictionary_ just has the
+  // pointers).
+  std::shared_ptr<ResizableBuffer> byte_array_data_;
+
+  ::arrow::util::RleDecoder idx_decoder_;
+};
+
+template <typename Type>
+inline void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
+  int num_dictionary_values = dictionary->values_left();
+  dictionary_.Resize(num_dictionary_values);
+  dictionary->Decode(dictionary_.data(), num_dictionary_values);
+}
+
+template <>
+inline void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
+  ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
+}
+
+template <>
+inline void DictDecoderImpl<ByteArrayType>::SetDict(
+    TypedDecoder<ByteArrayType>* dictionary) {
+  int num_dictionary_values = dictionary->values_left();
+  dictionary_.Resize(num_dictionary_values);
+  dictionary->Decode(&dictionary_[0], num_dictionary_values);
+
+  int total_size = 0;
+  for (int i = 0; i < num_dictionary_values; ++i) {
+    total_size += dictionary_[i].len;
+  }
+  if (total_size > 0) {
+    PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false));
+  }
+
+  int offset = 0;
+  uint8_t* bytes_data = byte_array_data_->mutable_data();
+  for (int i = 0; i < num_dictionary_values; ++i) {
+    memcpy(bytes_data + offset, dictionary_[i].ptr, dictionary_[i].len);
+    dictionary_[i].ptr = bytes_data + offset;
+    offset += dictionary_[i].len;
+  }
+}
+
+template <>
+inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionary) {
+  int num_dictionary_values = dictionary->values_left();
+  dictionary_.Resize(num_dictionary_values);
+  dictionary->Decode(&dictionary_[0], num_dictionary_values);
+
+  int fixed_len = descr_->type_length();
+  int total_size = num_dictionary_values * fixed_len;
+
+  PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false));
+  uint8_t* bytes_data = byte_array_data_->mutable_data();
+  for (int32_t i = 0, offset = 0; i < num_dictionary_values; ++i, offset += fixed_len) {
+    memcpy(bytes_data + offset, dictionary_[i].ptr, fixed_len);
+    dictionary_[i].ptr = bytes_data + offset;
+  }
+}
+
+class DictByteArrayDecoder : public DictDecoderImpl<ByteArrayType>,
+                             virtual public ByteArrayDecoder {
+ public:
+  using BASE = DictDecoderImpl<ByteArrayType>;
+  using BASE::DictDecoderImpl;
+
+  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                  int64_t valid_bits_offset,
+                  ::arrow::internal::ChunkedBinaryBuilder* out) override {
+    int result = 0;
+    PARQUET_THROW_NOT_OK(
+        DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result));
+    return result;
+  }
+
+  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                  int64_t valid_bits_offset,
+                  ::arrow::BinaryDictionaryBuilder* out) override {
+    int result = 0;
+    PARQUET_THROW_NOT_OK(
+        DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result));
+    return result;
+  }
+
+  int DecodeArrowNonNull(int num_values,
+                         ::arrow::internal::ChunkedBinaryBuilder* out) override {
+    int result = 0;
+    PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result));
+    return result;
+  }
+
+ private:
+  template <typename BuilderType>
+  ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                              int64_t valid_bits_offset, BuilderType* builder,
+                              int* out_num_values) {
+    constexpr int32_t buffer_size = 1024;
+    int32_t indices_buffer[buffer_size];
+
+    ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+    int values_decoded = 0;
+    while (values_decoded < num_values) {
+      bool is_valid = bit_reader.IsSet();
+      bit_reader.Next();
+
+      if (is_valid) {
+        int32_t batch_size =
+            std::min(buffer_size, num_values - values_decoded - null_count);
+        int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size);
+
+        int i = 0;
+        while (true) {
+          // Consume all indices
+          if (is_valid) {
+            const auto& val = dictionary_[indices_buffer[i]];
+            ARROW_RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+            ++i;
+          } else {
+            ARROW_RETURN_NOT_OK(builder->AppendNull());
+            --null_count;
+          }
+          ++values_decoded;
+          if (i == num_indices) {
+            // Do not advance the bit_reader if we have fulfilled the decode
+            // request
+            break;
+          }
+          is_valid = bit_reader.IsSet();
+          bit_reader.Next();
+        }
+      } else {
+        ARROW_RETURN_NOT_OK(builder->AppendNull());
+        --null_count;
+        ++values_decoded;
+      }
+    }
+    if (values_decoded != num_values) {
+      return ::arrow::Status::IOError("Expected to dictionary-decode ", num_values,
+                                      " but only able to decode ", values_decoded);
+    }
+    *out_num_values = values_decoded;
+    return ::arrow::Status::OK();
+  }
+
+  template <typename BuilderType>
+  ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder,
+                                     int* out_num_values) {
+    constexpr int32_t buffer_size = 2048;
+    int32_t indices_buffer[buffer_size];
+    int values_decoded = 0;
+    while (values_decoded < num_values) {
+      int num_indices = idx_decoder_.GetBatch(indices_buffer, buffer_size);
+      if (num_indices == 0) break;
+      for (int i = 0; i < num_indices; ++i) {
+        const auto& val = dictionary_[indices_buffer[i]];
+        PARQUET_THROW_NOT_OK(builder->Append(val.ptr, val.len));
+      }
+      values_decoded += num_indices;
+    }
+    if (values_decoded != num_values) {
+      ParquetException::EofException();
+    }
+    *out_num_values = values_decoded;
+    return ::arrow::Status::OK();
+  }
+};
+
+class DictFLBADecoder : public DictDecoderImpl<FLBAType>, virtual public FLBADecoder {
+ public:
+  using BASE = DictDecoderImpl<FLBAType>;
+  using BASE::DictDecoderImpl;
+};
+
+// ----------------------------------------------------------------------
+// DeltaBitPackDecoder
+
+template <typename DType>
+class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+  typedef typename DType::c_type T;
+
+  explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
+                               ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+      : DecoderImpl(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
+    if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
+      throw ParquetException("Delta bit pack encoding should only be for integer data.");
+    }
+  }
+
+  virtual void SetData(int num_values, const uint8_t* data, int len) {
+    this->num_values_ = num_values;
+    decoder_ = ::arrow::BitUtil::BitReader(data, len);
+    values_current_block_ = 0;
+    values_current_mini_block_ = 0;
+  }
+
+  virtual int Decode(T* buffer, int max_values) {
+    return GetInternal(buffer, max_values);
+  }
+
+ private:
+  void InitBlock() {
+    int32_t block_size;
+    if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&values_current_block_)) {
+      ParquetException::EofException();
+    }
+    if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
+
+    delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
+    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+
+    if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
+    for (int i = 0; i < num_mini_blocks_; ++i) {
+      if (!decoder_.GetAligned(1, bit_width_data + i)) {
+        ParquetException::EofException();
+      }
+    }
+    values_per_mini_block_ = block_size / num_mini_blocks_;
+    mini_block_idx_ = 0;
+    delta_bit_width_ = bit_width_data[0];
+    values_current_mini_block_ = values_per_mini_block_;
+  }
+
+  template <typename T>
+  int GetInternal(T* buffer, int max_values) {
+    max_values = std::min(max_values, this->num_values_);
+    const uint8_t* bit_width_data = delta_bit_widths_->data();
+    for (int i = 0; i < max_values; ++i) {
+      if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
+        ++mini_block_idx_;
+        if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
+          delta_bit_width_ = bit_width_data[mini_block_idx_];
+          values_current_mini_block_ = values_per_mini_block_;
+        } else {
+          InitBlock();
+          buffer[i] = last_value_;
+          continue;
+        }
+      }
+
+      // TODO: the key to this algorithm is to decode the entire miniblock at once.
+      int64_t delta;
+      if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
+      delta += min_delta_;
+      last_value_ += static_cast<int32_t>(delta);
+      buffer[i] = last_value_;
+      --values_current_mini_block_;
+    }
+    this->num_values_ -= max_values;
+    return max_values;
+  }
+
+  ::arrow::MemoryPool* pool_;
+  ::arrow::BitUtil::BitReader decoder_;
+  int32_t values_current_block_;
+  int32_t num_mini_blocks_;
+  uint64_t values_per_mini_block_;
+  uint64_t values_current_mini_block_;
+
+  int32_t min_delta_;
+  size_t mini_block_idx_;
+  std::shared_ptr<ResizableBuffer> delta_bit_widths_;
+  int delta_bit_width_;
+
+  int32_t last_value_;
+};
+
+// ----------------------------------------------------------------------
+// DELTA_LENGTH_BYTE_ARRAY
+
+class DeltaLengthByteArrayDecoder : public DecoderImpl,
+                                    virtual public TypedDecoder<ByteArrayType> {
+ public:
+  explicit DeltaLengthByteArrayDecoder(
+      const ColumnDescriptor* descr,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+      : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
+        len_decoder_(nullptr, pool) {}
+
+  virtual void SetData(int num_values, const uint8_t* data, int len) {
+    num_values_ = num_values;
+    if (len == 0) return;
+    int total_lengths_len = *reinterpret_cast<const int*>(data);
+    data += 4;
+    this->len_decoder_.SetData(num_values, data, total_lengths_len);
+    data_ = data + total_lengths_len;
+    this->len_ = len - 4 - total_lengths_len;
+  }
+
+  virtual int Decode(ByteArray* buffer, int max_values) {
+    max_values = std::min(max_values, num_values_);
+    std::vector<int> lengths(max_values);
+    len_decoder_.Decode(lengths.data(), max_values);
+    for (int i = 0; i < max_values; ++i) {
+      buffer[i].len = lengths[i];
+      buffer[i].ptr = data_;
+      this->data_ += lengths[i];
+      this->len_ -= lengths[i];
+    }
+    this->num_values_ -= max_values;
+    return max_values;
+  }
+
+ private:
+  DeltaBitPackDecoder<Int32Type> len_decoder_;
+};
+
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY
+
+class DeltaByteArrayDecoder : public DecoderImpl,
+                              virtual public TypedDecoder<ByteArrayType> {
+ public:
+  explicit DeltaByteArrayDecoder(
+      const ColumnDescriptor* descr,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+      : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
+        prefix_len_decoder_(nullptr, pool),
+        suffix_decoder_(nullptr, pool),
+        last_value_(0, nullptr) {}
+
+  virtual void SetData(int num_values, const uint8_t* data, int len) {
+    num_values_ = num_values;
+    if (len == 0) return;
+    int prefix_len_length = *reinterpret_cast<const int*>(data);
+    data += 4;
+    len -= 4;
+    prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
+    data += prefix_len_length;
+    len -= prefix_len_length;
+    suffix_decoder_.SetData(num_values, data, len);
+  }
+
+  // TODO: this doesn't work and requires memory management. We need to allocate
+  // new strings to store the results.
+  virtual int Decode(ByteArray* buffer, int max_values) {
+    max_values = std::min(max_values, this->num_values_);
+    for (int i = 0; i < max_values; ++i) {
+      int prefix_len = 0;
+      prefix_len_decoder_.Decode(&prefix_len, 1);
+      ByteArray suffix = {0, nullptr};
+      suffix_decoder_.Decode(&suffix, 1);
+      buffer[i].len = prefix_len + suffix.len;
+
+      uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
+      memcpy(result, last_value_.ptr, prefix_len);
+      memcpy(result + prefix_len, suffix.ptr, suffix.len);
+
+      buffer[i].ptr = result;
+      last_value_ = buffer[i];
+    }
+    this->num_values_ -= max_values;
+    return max_values;
+  }
+
+ private:
+  DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
+  DeltaLengthByteArrayDecoder suffix_decoder_;
+  ByteArray last_value_;
+};
+
+// ----------------------------------------------------------------------
+
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+                                     const ColumnDescriptor* descr) {
+  if (encoding == Encoding::PLAIN) {
+    switch (type_num) {
+      case Type::BOOLEAN:
+        return std::unique_ptr<Decoder>(new PlainBooleanDecoder(descr));
+      case Type::INT32:
+        return std::unique_ptr<Decoder>(new PlainDecoder<Int32Type>(descr));
+      case Type::INT64:
+        return std::unique_ptr<Decoder>(new PlainDecoder<Int64Type>(descr));
+      case Type::INT96:
+        return std::unique_ptr<Decoder>(new PlainDecoder<Int96Type>(descr));
+      case Type::FLOAT:
+        return std::unique_ptr<Decoder>(new PlainDecoder<FloatType>(descr));
+      case Type::DOUBLE:
+        return std::unique_ptr<Decoder>(new PlainDecoder<DoubleType>(descr));
+      case Type::BYTE_ARRAY:
+        return std::unique_ptr<Decoder>(new PlainByteArrayDecoder(descr));
+      case Type::FIXED_LEN_BYTE_ARRAY:
+        return std::unique_ptr<Decoder>(new PlainFLBADecoder(descr));
+      default:
+        break;
+    }
+  } else {
+    ParquetException::NYI("Selected encoding is not supported");
+  }
+  DCHECK(false) << "Should not be able to reach this code";
+  return nullptr;
+}
+
+namespace detail {
+
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+                                         const ColumnDescriptor* descr,
+                                         ::arrow::MemoryPool* pool) {
+  switch (type_num) {
+    case Type::BOOLEAN:
+      ParquetException::NYI("Dictionary encoding not implemented for boolean type");
+    case Type::INT32:
+      return std::unique_ptr<Decoder>(new DictDecoderImpl<Int32Type>(descr, pool));
+    case Type::INT64:
+      return std::unique_ptr<Decoder>(new DictDecoderImpl<Int64Type>(descr, pool));
+    case Type::INT96:
+      return std::unique_ptr<Decoder>(new DictDecoderImpl<Int96Type>(descr, pool));
+    case Type::FLOAT:
+      return std::unique_ptr<Decoder>(new DictDecoderImpl<FloatType>(descr, pool));
+    case Type::DOUBLE:
+      return std::unique_ptr<Decoder>(new DictDecoderImpl<DoubleType>(descr, pool));
+    case Type::BYTE_ARRAY:
+      return std::unique_ptr<Decoder>(new DictByteArrayDecoder(descr, pool));
+    case Type::FIXED_LEN_BYTE_ARRAY:
+      return std::unique_ptr<Decoder>(new DictFLBADecoder(descr, pool));
+    default:
+      break;
+  }
+  DCHECK(false) << "Should not be able to reach this code";
+  return nullptr;
+}
+
+}  // namespace detail
+
+}  // namespace parquet
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 006f22f2d114a..046296cdb1445 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -15,50 +15,66 @@
 // specific language governing permissions and limitations
 // under the License.
-#ifndef PARQUET_ENCODING_H -#define PARQUET_ENCODING_H +#pragma once #include +#include #include -#include +#include -#include "arrow/status.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" #include "parquet/exception.h" -#include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/memory.h" +#include "parquet/util/visibility.h" + +namespace arrow { + +class BinaryDictionaryBuilder; + +namespace internal { + +class ChunkedBinaryBuilder; + +} // namespace internal +} // namespace arrow namespace parquet { class ColumnDescriptor; +// Untyped base for all encoders +class Encoder { + public: + virtual ~Encoder() = default; + + virtual int64_t EstimatedDataEncodedSize() = 0; + virtual std::shared_ptr FlushValues() = 0; + virtual Encoding::type encoding() const = 0; + + virtual ::arrow::MemoryPool* memory_pool() const = 0; +}; + // Base class for value encoders. Since encoders may or not have state (e.g., // dictionary encoding) we use a class instance to maintain any state. // // TODO(wesm): Encode interface API is temporary template -class Encoder { +class TypedEncoder : virtual public Encoder { public: typedef typename DType::c_type T; - virtual ~Encoder() {} - - virtual int64_t EstimatedDataEncodedSize() = 0; - virtual std::shared_ptr FlushValues() = 0; virtual void Put(const T* src, int num_values) = 0; + virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, int64_t valid_bits_offset) { std::shared_ptr buffer; - auto status = - ::arrow::AllocateResizableBuffer(pool_, num_values * sizeof(T), &buffer); - if (!status.ok()) { - std::ostringstream ss; - ss << "AllocateResizableBuffer failed in Encoder.PutSpaced in " << __FILE__ - << ", on line " << __LINE__; - throw ParquetException(ss.str()); - } + PARQUET_THROW_NOT_OK(::arrow::AllocateResizableBuffer( + this->memory_pool(), num_values * sizeof(T), &buffer)); int32_t num_valid_values = 0; ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, num_values); @@ -71,32 +87,53 @@ class Encoder { } Put(data, num_valid_values); } +}; + +// Base class for dictionary encoders +template +class DictEncoder : virtual public TypedEncoder { + public: + /// Writes out any buffered indices to buffer preceded by the bit width of this data. + /// Returns the number of bytes written. + /// If the supplied buffer is not big enough, returns -1. + /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize() + /// to size buffer. + virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0; + + virtual int dict_encoded_size() = 0; + // virtual int dict_encoded_size() { return dict_encoded_size_; } - Encoding::type encoding() const { return encoding_; } + virtual int bit_width() const = 0; - protected: - explicit Encoder(const ColumnDescriptor* descr, Encoding::type encoding, - ::arrow::MemoryPool* pool) - : descr_(descr), encoding_(encoding), pool_(pool) {} + /// Writes out the encoded dictionary to buffer. buffer must be preallocated to + /// dict_encoded_size() bytes. 
+ virtual void WriteDict(uint8_t* buffer) = 0; - // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY - const ColumnDescriptor* descr_; - const Encoding::type encoding_; - ::arrow::MemoryPool* pool_; + virtual int num_entries() const = 0; }; -// The Decoder template is parameterized on parquet::DataType subclasses -template +// ---------------------------------------------------------------------- +// Value decoding + class Decoder { public: - typedef typename DType::c_type T; - - virtual ~Decoder() {} + virtual ~Decoder() = default; // Sets the data for a new page. This will be called multiple times on the same // decoder and should reset all internal state. virtual void SetData(int num_values, const uint8_t* data, int len) = 0; + // Returns the number of values left (for the last call to SetData()). This is + // the number of values left in this page. + virtual int values_left() const = 0; + virtual Encoding::type encoding() const = 0; +}; + +template +class TypedDecoder : virtual public Decoder { + public: + using T = typename DType::c_type; + // Subclasses should override the ones they support. In each of these functions, // the decoder would decode put to 'max_values', storing the result in 'buffer'. // The function returns the number of values decoded, which should be max_values @@ -117,7 +154,8 @@ class Decoder { // Depending on the number of nulls, some of the value slots in buffer may // be uninitialized, and this will cause valgrind warnings / potentially UB - memset(buffer + values_read, 0, (num_values - values_read) * sizeof(T)); + memset(static_cast(buffer + values_read), 0, + (num_values - values_read) * sizeof(T)); // Add spacing for null entries. As we have filled the buffer from the front, // we need to add the spacing from the back. @@ -129,24 +167,166 @@ class Decoder { } return num_values; } +}; - // Returns the number of values left (for the last call to SetData()). This is - // the number of values left in this page. 
- int values_left() const { return num_values_; } +template +class DictDecoder : virtual public TypedDecoder { + public: + virtual void SetDict(TypedDecoder* dictionary) = 0; +}; - Encoding::type encoding() const { return encoding_; } +// ---------------------------------------------------------------------- +// TypedEncoder specializations, traits, and factory functions - protected: - explicit Decoder(const ColumnDescriptor* descr, Encoding::type encoding) - : descr_(descr), encoding_(encoding), num_values_(0) {} +class BooleanEncoder : virtual public TypedEncoder { + public: + using TypedEncoder::Put; + virtual void Put(const std::vector& src, int num_values) = 0; +}; - // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY - const ColumnDescriptor* descr_; +using Int32Encoder = TypedEncoder; +using Int64Encoder = TypedEncoder; +using Int96Encoder = TypedEncoder; +using FloatEncoder = TypedEncoder; +using DoubleEncoder = TypedEncoder; +class ByteArrayEncoder : virtual public TypedEncoder {}; +class FLBAEncoder : virtual public TypedEncoder {}; - const Encoding::type encoding_; - int num_values_; +class BooleanDecoder : virtual public TypedDecoder { + public: + using TypedDecoder::Decode; + virtual int Decode(uint8_t* buffer, int max_values) = 0; }; -} // namespace parquet +using Int32Decoder = TypedDecoder; +using Int64Decoder = TypedDecoder; +using Int96Decoder = TypedDecoder; +using FloatDecoder = TypedDecoder; +using DoubleDecoder = TypedDecoder; + +class ByteArrayDecoder : virtual public TypedDecoder { + public: + using TypedDecoder::DecodeSpaced; + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; + + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionaryBuilder* builder) = 0; + + // TODO(wesm): Implement DecodeArrowNonNull as part of ARROW-3325 + // See also ARROW-3772, ARROW-3769 + virtual int DecodeArrowNonNull(int num_values, + ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; +}; + +class FLBADecoder : virtual public TypedDecoder { + public: + using TypedDecoder::DecodeSpaced; + + // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if + // there is value in adding specialized read methods for + // FIXED_LEN_BYTE_ARRAY. 
If only Decimal data can occur with this data type + // then perhaps not +}; + +template +struct EncodingTraits {}; + +template <> +struct EncodingTraits { + using Encoder = BooleanEncoder; + using Decoder = BooleanDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = Int32Encoder; + using Decoder = Int32Decoder; +}; + +template <> +struct EncodingTraits { + using Encoder = Int64Encoder; + using Decoder = Int64Decoder; +}; + +template <> +struct EncodingTraits { + using Encoder = Int96Encoder; + using Decoder = Int96Decoder; +}; -#endif // PARQUET_ENCODING_H +template <> +struct EncodingTraits { + using Encoder = FloatEncoder; + using Decoder = FloatDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = DoubleEncoder; + using Decoder = DoubleDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = ByteArrayEncoder; + using Decoder = ByteArrayDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = FLBAEncoder; + using Decoder = FLBADecoder; +}; + +PARQUET_EXPORT +std::unique_ptr MakeEncoder( + Type::type type_num, Encoding::type encoding, bool use_dictionary = false, + const ColumnDescriptor* descr = NULLPTR, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + +template +std::unique_ptr::Encoder> MakeTypedEncoder( + Encoding::type encoding, bool use_dictionary = false, + const ColumnDescriptor* descr = NULLPTR, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + using OutType = typename EncodingTraits::Encoder; + std::unique_ptr base = + MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool); + return std::unique_ptr(dynamic_cast(base.release())); +} + +PARQUET_EXPORT +std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, + const ColumnDescriptor* descr = NULLPTR); + +namespace detail { + +PARQUET_EXPORT +std::unique_ptr MakeDictDecoder(Type::type type_num, + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool); + +} // namespace detail + +template +std::unique_ptr> MakeDictDecoder( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + using OutType = DictDecoder; + auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); + return std::unique_ptr(dynamic_cast(decoder.release())); +} + +template +std::unique_ptr::Decoder> MakeTypedDecoder( + Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) { + using OutType = typename EncodingTraits::Decoder; + std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr); + return std::unique_ptr(dynamic_cast(base.release())); +} + +} // namespace parquet diff --git a/cpp/src/parquet/file-deserialize-test.cc b/cpp/src/parquet/file-deserialize-test.cc index 17dfe387fd6e0..e62968e5d5dc9 100644 --- a/cpp/src/parquet/file-deserialize-test.cc +++ b/cpp/src/parquet/file-deserialize-test.cc @@ -22,6 +22,7 @@ #include #include "parquet/column_page.h" +#include "parquet/column_reader.h" #include "parquet/exception.h" #include "parquet/file_reader.h" #include "parquet/thrift.h" @@ -85,8 +86,8 @@ class TestPageSerde : public ::testing::Test { page_header_.compressed_page_size = compressed_size; page_header_.type = format::PageType::DATA_PAGE; - ASSERT_NO_THROW( - SerializeThriftMsg(&page_header_, max_serialized_len, out_stream_.get())); + ThriftSerializer serializer; + ASSERT_NO_THROW(serializer.Serialize(&page_header_, out_stream_.get())); } void ResetStream() { out_stream_.reset(new InMemoryOutputStream); } @@ -176,9 +177,11 @@ TEST_F(TestPageSerde, 
TestFailLargePageHeaders) { } TEST_F(TestPageSerde, Compression) { - Compression::type codec_types[5] = {Compression::GZIP, Compression::SNAPPY, - Compression::BROTLI, Compression::LZ4, - Compression::ZSTD}; + std::vector codec_types = {Compression::GZIP, Compression::SNAPPY, + Compression::BROTLI, Compression::LZ4}; +#ifdef ARROW_WITH_ZSTD + codec_types.push_back(Compression::ZSTD); +#endif const int32_t num_rows = 32; // dummy value data_page_header_.num_values = num_rows; diff --git a/cpp/src/parquet/file-serialize-test.cc b/cpp/src/parquet/file-serialize-test.cc index 750faa20e2454..88dd657603184 100644 --- a/cpp/src/parquet/file-serialize-test.cc +++ b/cpp/src/parquet/file-serialize-test.cc @@ -301,9 +301,11 @@ TYPED_TEST(TestSerialize, SmallFileLz4) { ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::LZ4)); } +#ifdef ARROW_WITH_ZSTD TYPED_TEST(TestSerialize, SmallFileZstd) { ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::ZSTD)); } +#endif } // namespace test diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 5be1a86234f3b..0f8e35904c606 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -19,23 +19,22 @@ #include #include -#include +#include #include -#include #include #include -#include +#include "arrow/buffer.h" #include "arrow/io/file.h" +#include "arrow/status.h" #include "arrow/util/logging.h" -#include "parquet/column_page.h" #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/exception.h" #include "parquet/metadata.h" #include "parquet/properties.h" -#include "parquet/thrift.h" +#include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/memory.h" diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index 4730305c93131..2d1cc9221f377 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -19,24 +19,24 @@ #define PARQUET_FILE_READER_H #include -#include -#include #include #include #include -#include "parquet/column_reader.h" -#include "parquet/metadata.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/macros.h" + +#include "parquet/metadata.h" // IWYU pragma:: keep #include "parquet/properties.h" -#include "parquet/schema.h" -#include "parquet/statistics.h" -#include "parquet/util/macros.h" -#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { class ColumnReader; +class FileMetaData; +class PageReader; +class RandomAccessSource; +class RowGroupMetaData; class PARQUET_EXPORT RowGroupReader { public: diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 01fa112fe37ef..51f0cb43b7eea 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -21,15 +21,12 @@ #include #include "parquet/column_writer.h" -#include "parquet/schema-internal.h" #include "parquet/schema.h" -#include "parquet/thrift.h" #include "parquet/util/memory.h" using arrow::MemoryPool; using parquet::schema::GroupNode; -using parquet::schema::SchemaFlattener; namespace parquet { @@ -251,6 +248,9 @@ class FileSerializer : public ParquetFileWriter::Contents { void Close() override { if (is_open_) { + // If any functions here raise an exception, we set is_open_ to be false + // so that this does not get called again (possibly causing segfault) + is_open_ = false; if (row_group_writer_) { num_rows_ += row_group_writer_->num_rows(); row_group_writer_->Close(); @@ -262,7 +262,6 @@ class FileSerializer : public 
ParquetFileWriter::Contents { WriteFileMetaData(*metadata, sink_.get()); sink_->Close(); - is_open_ = false; } } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 82703f82dc899..860500f3bfe14 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -20,25 +20,31 @@ #include #include +#include +#include "arrow/util/macros.h" + +#include "parquet/exception.h" #include "parquet/metadata.h" #include "parquet/properties.h" #include "parquet/schema.h" -#include "parquet/util/macros.h" -#include "parquet/util/memory.h" #include "parquet/util/visibility.h" -namespace parquet { +namespace arrow { + +class MemoryPool; + +namespace io { -class ColumnWriter; -class PageWriter; class OutputStream; -namespace schema { +} // namespace io +} // namespace arrow -class GroupNode; +namespace parquet { -} // namespace schema +class ColumnWriter; +class OutputStream; class PARQUET_EXPORT RowGroupWriter { public: diff --git a/cpp/src/parquet/hasher.h b/cpp/src/parquet/hasher.h index dc316a0377cba..233262ebdd647 100644 --- a/cpp/src/parquet/hasher.h +++ b/cpp/src/parquet/hasher.h @@ -63,8 +63,8 @@ class Hasher { /// Compute hash for fixed byte array value by using its plain encoding result. /// - /// @param value the value to hash. - /// @return hash result. + /// @param value the value address. + /// @param len the value length. virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; virtual ~Hasher() = default; diff --git a/cpp/src/parquet/metadata-test.cc b/cpp/src/parquet/metadata-test.cc index bcf911eab8b26..826ac4d6a504f 100644 --- a/cpp/src/parquet/metadata-test.cc +++ b/cpp/src/parquet/metadata-test.cc @@ -59,7 +59,6 @@ TEST(Metadata, TestBuildAccess) { auto f_builder = FileMetaDataBuilder::Make(&schema, props); auto rg1_builder = f_builder->AppendRowGroup(); - auto rg2_builder = f_builder->AppendRowGroup(); // Write the metadata // rowgroup1 metadata @@ -75,6 +74,7 @@ TEST(Metadata, TestBuildAccess) { rg1_builder->Finish(1024); // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); col1_builder = rg2_builder->NextColumnChunk(); col2_builder = rg2_builder->NextColumnChunk(); // column metadata diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index cf63b0f662b52..93c2073e898ba 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -16,22 +16,25 @@ // under the License. 
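One behavioral fix in file_writer.cc above deserves a note: FileSerializer::Close() now clears is_open_ before any step that can throw, so a retried Close() (for example from a destructor) becomes a no-op rather than touching already-released state. A generic sketch of the pattern, not the patch itself:

class Resource {
 public:
  ~Resource() {
    try {
      Close();
    } catch (...) {
      // Destructors must not propagate exceptions.
    }
  }

  void Close() {
    if (!is_open_) return;
    // Mark closed *before* doing work that may throw: if FlushAndRelease()
    // raises, a second Close() call (e.g. from the destructor) is a harmless
    // no-op instead of re-entering the teardown path.
    is_open_ = false;
    FlushAndRelease();
  }

 private:
  void FlushAndRelease() { /* write footer, release buffers; may throw */ }
  bool is_open_ = true;
};

int main() {
  Resource r;
  r.Close();  // the implicit Close() in ~Resource is then a no-op
  return 0;
}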
#include +#include #include #include -#include + +#include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/metadata.h" #include "parquet/schema-internal.h" #include "parquet/schema.h" +#include "parquet/statistics.h" #include "parquet/thrift.h" -#include "parquet/util/memory.h" -#include -#include +#include // IWYU pragma: keep namespace parquet { +class OutputStream; + const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() { static ApplicationVersion version("parquet-mr", 1, 8, 0); return version; @@ -47,6 +50,23 @@ const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() return version; } +const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() { + static ApplicationVersion version("parquet-mr", 1, 10, 0); + return version; +} + +std::string ParquetVersionToString(ParquetVersion::type ver) { + switch (ver) { + case ParquetVersion::PARQUET_1_0: + return "1.0"; + case ParquetVersion::PARQUET_2_0: + return "2.0"; + } + + // This should be unreachable + return "UNKNOWN"; +} + template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { @@ -103,7 +123,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } possible_stats_ = nullptr; } - ~ColumnChunkMetaDataImpl() {} // column chunk inline int64_t file_offset() const { return column_->file_offset; } @@ -185,13 +204,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const uint8_t* metadata, const ColumnDescriptor* descr, + const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, writer_version)); } -ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata, +ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( @@ -260,7 +279,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} - ~RowGroupMetaDataImpl() {} inline int num_columns() const { return static_cast(row_group_->columns.size()); } @@ -277,9 +295,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make( - reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), - writer_version_); + return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), + writer_version_); } private: @@ -289,14 +306,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { }; std::unique_ptr RowGroupMetaData::Make( - const uint8_t* metadata, const SchemaDescriptor* schema, + const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) { return std::unique_ptr( new RowGroupMetaData(metadata, schema, writer_version)); } -RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata, - const SchemaDescriptor* schema, +RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : impl_{std::unique_ptr(new RowGroupMetaDataImpl( reinterpret_cast(metadata), schema, writer_version))} { @@ -320,10 +336,11 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : 
metadata_len_(0) {} - explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); - DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, + metadata_.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -336,7 +353,6 @@ class FileMetaData::FileMetaDataImpl { InitColumnOrders(); InitKeyValueMetadata(); } - ~FileMetaDataImpl() {} inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } @@ -353,7 +369,8 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(OutputStream* dst) const { - SerializeThriftMsg(metadata_.get(), 1024, dst); + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); } std::unique_ptr RowGroup(int i) { @@ -363,9 +380,7 @@ class FileMetaData::FileMetaDataImpl { << " row groups, requested metadata for row group: " << i; throw ParquetException(ss.str()); } - return RowGroupMetaData::Make( - reinterpret_cast(&metadata_->row_groups[i]), &schema_, - &writer_version_); + return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_); } const SchemaDescriptor* schema() const { return &schema_; } @@ -417,13 +432,13 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const uint8_t* metadata, +std::shared_ptr FileMetaData::Make(const void* metadata, uint32_t* metadata_len) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr(new FileMetaData(metadata, metadata_len)); } -FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len) +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len))} {} @@ -540,8 +555,10 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, EncodedStatistics& statistics, SortOrder::type sort_order) const { - // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types - if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) { + // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed + // correctly for all types + if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) || + (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) { // Only SIGNED are valid unless max and min are the same // (in which case the sort order does not matter) bool max_equals_min = statistics.has_min && statistics.has_max @@ -594,11 +611,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { Init(column_chunk); } - ~ColumnChunkMetaDataBuilderImpl() {} - - const uint8_t* contents() const { - return reinterpret_cast(column_chunk_); - } + const void* contents() const { return column_chunk_; } // column chunk void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); } @@ -665,7 +678,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(OutputStream* sink) { - SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); + ThriftSerializer serializer; + 
serializer.Serialize(column_chunk_, sink); } const ColumnDescriptor* descr() const { return column_; } @@ -687,7 +701,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::unique_ptr ColumnChunkMetaDataBuilder::Make( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents) { + void* contents) { return std::unique_ptr( new ColumnChunkMetaDataBuilder(props, column, contents)); } @@ -705,14 +719,14 @@ ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents) + void* contents) : impl_{std::unique_ptr( new ColumnChunkMetaDataBuilderImpl( props, column, reinterpret_cast(contents)))} {} ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {} -const uint8_t* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); } +const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); } void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); @@ -742,12 +756,11 @@ void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, - const SchemaDescriptor* schema, uint8_t* contents) + const SchemaDescriptor* schema, void* contents) : properties_(props), schema_(schema), current_column_(0) { row_group_ = reinterpret_cast(contents); InitializeColumns(schema->num_columns()); } - ~RowGroupMetaDataBuilderImpl() {} ColumnChunkMetaDataBuilder* NextColumnChunk() { if (!(current_column_ < num_columns())) { @@ -758,8 +771,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { } auto column = schema_->Column(current_column_); auto column_builder = ColumnChunkMetaDataBuilder::Make( - properties_, column, - reinterpret_cast(&row_group_->columns[current_column_++])); + properties_, column, &row_group_->columns[current_column_++]); auto column_builder_ptr = column_builder.get(); column_builders_.push_back(std::move(column_builder)); return column_builder_ptr; @@ -808,14 +820,14 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { std::unique_ptr RowGroupMetaDataBuilder::Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents) { + void* contents) { return std::unique_ptr( new RowGroupMetaDataBuilder(props, schema_, contents)); } RowGroupMetaDataBuilder::RowGroupMetaDataBuilder( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents) + void* contents) : impl_{std::unique_ptr( new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} @@ -849,29 +861,21 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); } - ~FileMetaDataBuilderImpl() {} RowGroupMetaDataBuilder* AppendRowGroup() { - auto row_group = std::unique_ptr(new format::RowGroup()); - auto row_group_builder = RowGroupMetaDataBuilder::Make( - properties_, schema_, reinterpret_cast(row_group.get())); - RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get(); - row_group_builders_.push_back(std::move(row_group_builder)); - row_groups_.push_back(std::move(row_group)); - return row_group_ptr; + row_groups_.emplace_back(); + current_row_group_builder_ = + RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back()); + return 
current_row_group_builder_.get(); } std::unique_ptr Finish() { int64_t total_rows = 0; - std::vector row_groups; - for (auto row_group = row_groups_.begin(); row_group != row_groups_.end(); - row_group++) { - auto rowgroup = *((*row_group).get()); - row_groups.push_back(rowgroup); - total_rows += rowgroup.num_rows; + for (auto row_group : row_groups_) { + total_rows += row_group.num_rows; } metadata_->__set_num_rows(total_rows); - metadata_->__set_row_groups(row_groups); + metadata_->__set_row_groups(row_groups_); if (key_value_metadata_) { metadata_->key_value_metadata.clear(); @@ -926,8 +930,9 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { private: const std::shared_ptr properties_; - std::vector> row_groups_; - std::vector> row_group_builders_; + std::vector row_groups_; + + std::unique_ptr current_row_group_builder_; const SchemaDescriptor* schema_; std::shared_ptr key_value_metadata_; }; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 706e980711683..4ccf14be1fdd5 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -18,23 +18,32 @@ #ifndef PARQUET_FILE_METADATA_H #define PARQUET_FILE_METADATA_H +#include #include -#include #include #include #include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" #include "parquet/properties.h" -#include "parquet/schema.h" -#include "parquet/statistics.h" #include "parquet/types.h" -#include "parquet/util/macros.h" -#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { +class ColumnDescriptor; +class EncodedStatistics; +class OutputStream; +class RowGroupStatistics; +class SchemaDescriptor; + +namespace schema { + +class ColumnPath; + +} // namespace schema + using KeyValueMetadata = ::arrow::KeyValueMetadata; class PARQUET_EXPORT ApplicationVersion { @@ -43,6 +52,7 @@ class PARQUET_EXPORT ApplicationVersion { static const ApplicationVersion& PARQUET_251_FIXED_VERSION(); static const ApplicationVersion& PARQUET_816_FIXED_VERSION(); static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION(); + static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION(); // Regular expression for the version format // major . minor . 
patch unknown - prerelease.x + build info // Eg: 1.5.0ab-cdh5.5.0+cd @@ -93,7 +103,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const uint8_t* metadata, const ColumnDescriptor* descr, + const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR); ~ColumnChunkMetaData(); @@ -119,7 +129,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_uncompressed_size() const; private: - explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, + explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -130,7 +140,7 @@ class PARQUET_EXPORT RowGroupMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const uint8_t* metadata, const SchemaDescriptor* schema, + const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULLPTR); ~RowGroupMetaData(); @@ -144,7 +154,7 @@ class PARQUET_EXPORT RowGroupMetaData { std::unique_ptr ColumnChunk(int i) const; private: - explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, + explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class RowGroupMetaDataImpl; @@ -156,7 +166,7 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const uint8_t* serialized_metadata, + static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len); ~FileMetaData(); @@ -182,7 +192,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); // PIMPL Idiom FileMetaData(); @@ -199,7 +209,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { static std::unique_ptr Make( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents); + void* contents); ~ColumnChunkMetaDataBuilder(); @@ -217,7 +227,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { bool dictionary_fallback); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make - const uint8_t* contents() const; + const void* contents() const; // For writing metadata at end of column chunk void WriteTo(OutputStream* sink); @@ -226,7 +236,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, - const ColumnDescriptor* column, uint8_t* contents); + const ColumnDescriptor* column, void* contents); // PIMPL Idiom class ColumnChunkMetaDataBuilderImpl; std::unique_ptr impl_; @@ -237,7 +247,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { // API convenience to get a MetaData reader static std::unique_ptr Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents); + void* contents); ~RowGroupMetaDataBuilder(); @@ -253,7 +263,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, - const SchemaDescriptor* schema_, uint8_t* contents); + const SchemaDescriptor* 
schema_, void* contents); // PIMPL Idiom class RowGroupMetaDataBuilderImpl; std::unique_ptr impl_; @@ -268,9 +278,10 @@ class PARQUET_EXPORT FileMetaDataBuilder { ~FileMetaDataBuilder(); + // The prior RowGroupMetaDataBuilder (if any) is destroyed RowGroupMetaDataBuilder* AppendRowGroup(); - // commit the metadata + // Complete the Thrift structure std::unique_ptr Finish(); private: @@ -282,6 +293,8 @@ class PARQUET_EXPORT FileMetaDataBuilder { std::unique_ptr impl_; }; +PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); + } // namespace parquet #endif // PARQUET_FILE_METADATA_H diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 9f26a4180cda1..61d669bcb34d6 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -17,15 +17,30 @@ #include "parquet/printer.h" +#include +#include +#include +#include #include #include +#include "arrow/util/key_value_metadata.h" + #include "parquet/column_scanner.h" +#include "parquet/exception.h" +#include "parquet/file_reader.h" +#include "parquet/metadata.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/types.h" using std::string; using std::vector; namespace parquet { + +class ColumnReader; + // ---------------------------------------------------------------------- // ParquetFilePrinter::DebugPrint @@ -38,7 +53,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte const FileMetaData* file_metadata = fileReader->metadata().get(); stream << "File Name: " << filename << "\n"; - stream << "Version: " << file_metadata->version() << "\n"; + stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n"; stream << "Created By: " << file_metadata->created_by() << "\n"; stream << "Total rows: " << file_metadata->num_rows() << "\n"; diff --git a/cpp/src/parquet/printer.h b/cpp/src/parquet/printer.h index 1113c3fecd25b..4591e7abad058 100644 --- a/cpp/src/parquet/printer.h +++ b/cpp/src/parquet/printer.h @@ -18,17 +18,15 @@ #ifndef PARQUET_FILE_PRINTER_H #define PARQUET_FILE_PRINTER_H -#include #include #include -#include -#include -#include -#include "parquet/file_reader.h" +#include "parquet/util/visibility.h" namespace parquet { +class ParquetFileReader; + class PARQUET_EXPORT ParquetFilePrinter { private: ParquetFileReader* fileReader; diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc index d628f4727c160..a0536b56a89ca 100644 --- a/cpp/src/parquet/reader-test.cc +++ b/cpp/src/parquet/reader-test.cc @@ -28,6 +28,7 @@ #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/file_reader.h" +#include "parquet/metadata.h" #include "parquet/printer.h" #include "parquet/util/memory.h" #include "parquet/util/test-common.h" diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index da004344f2016..431f30773b96d 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -19,11 +19,13 @@ #include "parquet/schema-internal.h" #include +#include #include -#include #include #include +#include "arrow/util/logging.h" + #include "parquet/exception.h" #include "parquet/thrift.h" diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index add2f6dbab013..76920c0e93b57 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -28,6 +28,8 @@ #include #include +#include "arrow/util/macros.h" + #include "parquet/types.h" #include "parquet/util/macros.h" #include "parquet/util/visibility.h" @@ -144,9 +146,7 @@ class PARQUET_EXPORT 
Node { const std::shared_ptr path() const; - // ToParquet returns an opaque void* to avoid exporting - // parquet::SchemaElement into the public API - virtual void ToParquet(void* opaque_element) const = 0; + virtual void ToParquet(void* element) const = 0; // Node::Visitor abstract class for walking schemas with the visitor pattern class Visitor { @@ -193,8 +193,6 @@ typedef std::vector NodeVector; // parameters) class PARQUET_EXPORT PrimitiveNode : public Node { public: - // FromParquet accepts an opaque void* to avoid exporting - // parquet::SchemaElement into the public API static std::unique_ptr FromParquet(const void* opaque_element, int id); static inline NodePtr Make(const std::string& name, Repetition::type repetition, @@ -217,7 +215,7 @@ class PARQUET_EXPORT PrimitiveNode : public Node { const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } - void ToParquet(void* opaque_element) const override; + void ToParquet(void* element) const override; void Visit(Visitor* visitor) override; void VisitConst(ConstVisitor* visitor) const override; @@ -250,8 +248,6 @@ class PARQUET_EXPORT PrimitiveNode : public Node { class PARQUET_EXPORT GroupNode : public Node { public: - // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting - // parquet::SchemaElement into the public API static std::unique_ptr FromParquet(const void* opaque_element, int id, const NodeVector& fields); @@ -273,7 +269,7 @@ class PARQUET_EXPORT GroupNode : public Node { int field_count() const { return static_cast(fields_.size()); } - void ToParquet(void* opaque_element) const override; + void ToParquet(void* element) const override; void Visit(Visitor* visitor) override; void VisitConst(ConstVisitor* visitor) const override; diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index e1926a36b684c..ecdbaeb78d83d 100644 --- a/cpp/src/parquet/statistics-test.cc +++ b/cpp/src/parquet/statistics-test.cc @@ -772,5 +772,33 @@ TEST(TestStatisticsDoubleNaN, NaNValues) { ASSERT_EQ(min, -3.0); ASSERT_EQ(max, 4.0); } + +// Test statistics for binary column with UNSIGNED sort order +TEST(TestStatisticsMinMax, Unsigned) { + std::string dir_string(test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/binary.parquet"; + auto path = ss.str(); + + // The file is generated by parquet-mr 1.10.0, the first version that + // supports correct statistics for binary data (see PARQUET-1025). It + // contains a single column of binary type. Data is just single byte values + // from 0x00 to 0x0B. + auto file_reader = ParquetFileReader::OpenFile(path); + auto rg_reader = file_reader->RowGroup(0); + auto metadata = rg_reader->metadata(); + auto column_schema = metadata->schema()->Column(0); + ASSERT_EQ(SortOrder::UNSIGNED, column_schema->sort_order()); + + auto column_chunk = metadata->ColumnChunk(0); + ASSERT_TRUE(column_chunk->is_stats_set()); + + std::shared_ptr stats = column_chunk->statistics(); + ASSERT_TRUE(stats != NULL); + ASSERT_EQ(0, stats->null_count()); + ASSERT_EQ(12, stats->num_values()); + ASSERT_EQ(0x00, stats->EncodeMin()[0]); + ASSERT_EQ(0x0b, stats->EncodeMax()[0]); +} } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index ed4e8d05592e4..4cb2bfd92131d 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -16,10 +16,13 @@ // under the License. 
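The TestStatisticsMinMax.Unsigned case added above hinges on sort order: the same bytes compare differently under signed and unsigned interpretation, so byte-wise min/max statistics are only meaningful once the sort order is fixed. A self-contained illustration, not part of the patch:

#include <cstdint>
#include <iostream>

int main() {
  const uint8_t a = 0x10;  // 16
  const uint8_t b = 0xFF;  // 255 unsigned, -1 when reinterpreted as signed
  const bool unsigned_less = a < b;  // true: 16 < 255
  const bool signed_less =
      static_cast<int8_t>(a) < static_cast<int8_t>(b);  // false: 16 > -1
  std::cout << "unsigned: " << unsigned_less << " signed: " << signed_less << "\n";
  return 0;
}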
#include +#include #include #include -#include "parquet/encoding-internal.h" +#include "arrow/util/logging.h" + +#include "parquet/encoding.h" #include "parquet/exception.h" #include "parquet/statistics.h" #include "parquet/util/memory.h" @@ -296,19 +299,19 @@ EncodedStatistics TypedRowGroupStatistics::Encode() { template void TypedRowGroupStatistics::PlainEncode(const T& src, std::string* dst) { - PlainEncoder encoder(descr(), pool_); - encoder.Put(&src, 1); - auto buffer = encoder.FlushValues(); + auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, descr(), pool_); + encoder->Put(&src, 1); + auto buffer = encoder->FlushValues(); auto ptr = reinterpret_cast(buffer->data()); dst->assign(ptr, buffer->size()); } template void TypedRowGroupStatistics::PlainDecode(const std::string& src, T* dst) { - PlainDecoder decoder(descr()); - decoder.SetData(1, reinterpret_cast(src.c_str()), - static_cast(src.size())); - decoder.Decode(dst, 1); + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr()); + decoder->SetData(1, reinterpret_cast(src.c_str()), + static_cast(src.size())); + decoder->Decode(dst, 1); } template <> diff --git a/cpp/src/parquet/test-util.h b/cpp/src/parquet/test-util.h index ab9c50a392862..ed7c7bb901621 100644 --- a/cpp/src/parquet/test-util.h +++ b/cpp/src/parquet/test-util.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,7 @@ #include "parquet/column_page.h" #include "parquet/column_reader.h" #include "parquet/column_writer.h" -#include "parquet/encoding-internal.h" +#include "parquet/encoding.h" #include "parquet/util/memory.h" #include "parquet/util/test-common.h" @@ -49,6 +50,15 @@ bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) { namespace test { +template +std::shared_ptr EncodeValues(Encoding::type encoding, bool use_dictionary, + const Sequence& values, int length, + const ColumnDescriptor* descr) { + auto encoder = MakeTypedEncoder(encoding, use_dictionary, descr); + encoder->Put(values, length); + return encoder->FlushValues(); +} + template static void InitValues(int num_values, vector& values, vector& buffer) { random_numbers(num_values, 0, std::numeric_limits::min(), @@ -132,9 +142,8 @@ class DataPageBuilder { void AppendValues(const ColumnDescriptor* d, const vector& values, Encoding::type encoding = Encoding::PLAIN) { - PlainEncoder encoder(d); - encoder.Put(&values[0], static_cast(values.size())); - std::shared_ptr values_sink = encoder.FlushValues(); + std::shared_ptr values_sink = EncodeValues( + encoding, false, values.data(), static_cast(values.size()), d); sink_->Write(values_sink->data(), values_sink->size()); num_values_ = std::max(static_cast(values.size()), num_values_); @@ -194,9 +203,11 @@ void DataPageBuilder::AppendValues(const ColumnDescriptor* d, if (encoding != Encoding::PLAIN) { ParquetException::NYI("only plain encoding currently implemented"); } - PlainEncoder encoder(d); - encoder.Put(values, static_cast(values.size())); - std::shared_ptr buffer = encoder.FlushValues(); + + auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, d); + dynamic_cast(encoder.get()) + ->Put(values, static_cast(values.size())); + std::shared_ptr buffer = encoder->FlushValues(); sink_->Write(buffer->data(), buffer->size()); num_values_ = std::max(static_cast(values.size()), num_values_); @@ -242,11 +253,14 @@ class DictionaryPageBuilder { public: typedef typename TYPE::c_type TC; static constexpr int TN = TYPE::type_num; + using SpecializedEncoder = typename EncodingTraits::Encoder; // This class 
writes data and metadata to the passed inputs explicit DictionaryPageBuilder(const ColumnDescriptor* d) : num_dict_values_(0), have_values_(false) { - encoder_.reset(new DictEncoder(d)); + auto encoder = MakeTypedEncoder(Encoding::PLAIN, true, d); + dict_traits_ = dynamic_cast*>(encoder.get()); + encoder_.reset(dynamic_cast(encoder.release())); } ~DictionaryPageBuilder() {} @@ -255,22 +269,23 @@ class DictionaryPageBuilder { int num_values = static_cast(values.size()); // Dictionary encoding encoder_->Put(values.data(), num_values); - num_dict_values_ = encoder_->num_entries(); + num_dict_values_ = dict_traits_->num_entries(); have_values_ = true; return encoder_->FlushValues(); } shared_ptr WriteDict() { - std::shared_ptr dict_buffer = - AllocateBuffer(::arrow::default_memory_pool(), encoder_->dict_encoded_size()); - encoder_->WriteDict(dict_buffer->mutable_data()); + std::shared_ptr dict_buffer = + AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size()); + dict_traits_->WriteDict(dict_buffer->mutable_data()); return dict_buffer; } int32_t num_values() const { return num_dict_values_; } private: - shared_ptr> encoder_; + DictEncoder* dict_traits_; + std::unique_ptr encoder_; int32_t num_dict_values_; bool have_values_; }; diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 9c665acfac4ff..1afd9bf436550 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifndef PARQUET_THRIFT_UTIL_H -#define PARQUET_THRIFT_UTIL_H +#pragma once #include "arrow/util/windows_compatibility.h" @@ -28,6 +27,7 @@ #else #include #endif +#include // TCompactProtocol requires some #defines to work right. #define SIGNED_RIGHT_SHIFT_IS 1 @@ -105,18 +105,18 @@ static inline format::CompressionCodec::type ToThrift(Compression::type type) { // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities +using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; + // Deserialize a thrift message from buf/len. buf/len must at least contain // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = + shared_ptr tmem_transport( + new ThriftBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; + shared_ptr tproto = // tproto_factory.getProtocol(tmem_transport); try { deserialized_msg->read(tproto.get()); @@ -129,34 +129,57 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali *len = *len - bytes_left; } -// Serialize obj into a buffer. The result is returned as a string. 
-// The arguments are the object to be serialized and -// the expected size of the serialized object -template -inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { - shared_ptr mem_buffer( - new apache::thrift::transport::TMemoryBuffer(len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = - tproto_factory.getProtocol(mem_buffer); - try { - mem_buffer->resetBuffer(); - obj->write(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't serialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); +/// Utility class to serialize thrift objects to a binary format. This object +/// should be reused if possible to reuse the underlying memory. +/// Note: thrift will encode NULLs into the serialized buffer so it is not valid +/// to treat it as a string. +class ThriftSerializer { + public: + explicit ThriftSerializer(int initial_buffer_size = 1024) + : mem_buffer_(new ThriftBuffer(initial_buffer_size)) { + apache::thrift::protocol::TCompactProtocolFactoryT factory; + protocol_ = factory.getProtocol(mem_buffer_); } - uint8_t* out_buffer; - uint32_t out_length; - mem_buffer->getBuffer(&out_buffer, &out_length); - out->Write(out_buffer, out_length); - return out_length; -} + /// Serialize obj into a memory buffer. The result is returned in buffer/len. The + /// memory returned is owned by this object and will be invalid when another object + /// is serialized. + template + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { + SerializeObject(obj); + mem_buffer_->getBuffer(buffer, len); + } -} // namespace parquet + template + void SerializeToString(const T* obj, std::string* result) { + SerializeObject(obj); + *result = mem_buffer_->getBufferAsString(); + } + + template + int64_t Serialize(const T* obj, OutputStream* out) { + uint8_t* out_buffer; + uint32_t out_length; + SerializeToBuffer(obj, &out_length, &out_buffer); + out->Write(out_buffer, out_length); + return static_cast(out_length); + } -#endif // PARQUET_THRIFT_UTIL_H + private: + template + void SerializeObject(const T* obj) { + try { + mem_buffer_->resetBuffer(); + obj->write(protocol_.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't serialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + + shared_ptr mem_buffer_; + shared_ptr protocol_; +}; + +} // namespace parquet diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index b27718027b0da..2bc51e7dc7902 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -160,7 +160,8 @@ struct ByteArray { }; inline bool operator==(const ByteArray& left, const ByteArray& right) { - return left.len == right.len && 0 == std::memcmp(left.ptr, right.ptr, left.len); + return left.len == right.len && + (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); } inline bool operator!=(const ByteArray& left, const ByteArray& right) { @@ -175,6 +176,19 @@ struct FixedLenByteArray { using FLBA = FixedLenByteArray; +// Julian day at unix epoch. 
+// +// The Julian Day Number (JDN) is the integer assigned to a whole solar day in +// the Julian day count starting from noon Universal time, with Julian day +// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC, +// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian +// calendar), +constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588); +constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24); +constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000); +constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000); +constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000); + MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; }; STRUCT_END(Int96, 12); @@ -192,6 +206,14 @@ static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } +static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { + int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; + int64_t nanoseconds = 0; + + memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); + return days_since_epoch * kNanosecondsPerDay + nanoseconds; +} + static inline std::string Int96ToString(const Int96& a) { std::ostringstream result; std::copy(a.value, a.value + 3, std::ostream_iterator(result, " ")); diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 72d4ca28f9b83..b5718b1601ee0 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -16,12 +16,7 @@ # under the License. # Headers: util -install(FILES - comparison.h - macros.h - memory.h - visibility.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util") +ARROW_INSTALL_ALL_HEADERS("parquet/util") ADD_PARQUET_TEST(comparison-test) ADD_PARQUET_TEST(memory-test) diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc index 6251f1c85c085..b3f83bdfdfd32 100644 --- a/cpp/src/parquet/util/memory.cc +++ b/cpp/src/parquet/util/memory.cc @@ -233,8 +233,11 @@ void InMemoryOutputStream::Write(const uint8_t* data, int64_t length) { PARQUET_THROW_NOT_OK(buffer_->Resize(new_capacity)); capacity_ = new_capacity; } - memcpy(Head(), data, length); - size_ += length; + // If length == 0, data may be null + if (length > 0) { + memcpy(Head(), data, length); + size_ += length; + } } int64_t InMemoryOutputStream::Tell() { return size_; } diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h index 8677e6b9dacbc..d63ed84dd7ead 100644 --- a/cpp/src/parquet/util/memory.h +++ b/cpp/src/parquet/util/memory.h @@ -66,6 +66,7 @@ class PARQUET_EXPORT Vector { void Swap(Vector& v); inline T& operator[](int64_t i) const { return data_[i]; } + T* data() { return data_; } const T* data() const { return data_; } private: diff --git a/cpp/src/parquet/util/visibility.h b/cpp/src/parquet/util/visibility.h index 929d3b22c8851..d731bad6ae47f 100644 --- a/cpp/src/parquet/util/visibility.h +++ b/cpp/src/parquet/util/visibility.h @@ -19,7 +19,8 @@ #define PARQUET_UTIL_VISIBILITY_H #if defined(_WIN32) || defined(__CYGWIN__) -#ifdef _MSC_VER + +#if defined(_MSC_VER) #pragma warning(push) // Disable warning for STL types usage in DLL interface // https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports @@ -30,9 +31,20 @@ #pragma warning(disable : 4005) // Disable extern before exported template warnings #pragma warning(disable 
: 4910) +#else +#pragma GCC diagnostic ignored "-Wattributes" #endif + +#ifdef PARQUET_STATIC +#define PARQUET_EXPORT +#elif defined(PARQUET_EXPORTING) #define PARQUET_EXPORT __declspec(dllexport) +#else +#define PARQUET_EXPORT __declspec(dllimport) +#endif + #define PARQUET_NO_EXPORT + #else // Not Windows #ifndef PARQUET_EXPORT #define PARQUET_EXPORT __attribute__((visibility("default"))) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 0f8916e6c48aa..566066463fffe 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -15,31 +15,30 @@ # specific language governing permissions and limitations # under the License. -cmake_minimum_required(VERSION 3.2) +add_custom_target(plasma-all) +add_custom_target(plasma) +add_custom_target(plasma-benchmarks) +add_custom_target(plasma-tests) +add_dependencies(plasma-all plasma plasma-tests plasma-benchmarks) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") +set(PLASMA_VERSION "${ARROW_VERSION}") -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../python/cmake_modules") - -find_package(PythonLibsNew REQUIRED) find_package(Threads) # The SO version is also the ABI version set(PLASMA_SO_VERSION "${ARROW_SO_VERSION}") set(PLASMA_FULL_SO_VERSION "${ARROW_FULL_SO_VERSION}") -include_directories(SYSTEM ${PYTHON_INCLUDE_DIRS}) include_directories("${FLATBUFFERS_INCLUDE_DIR}" "${CMAKE_CURRENT_LIST_DIR}/" "${CMAKE_CURRENT_LIST_DIR}/thirdparty/" "${CMAKE_CURRENT_LIST_DIR}/../") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-conversion") # Compile flatbuffers set(PLASMA_FBS_SRC "${CMAKE_CURRENT_LIST_DIR}/format/plasma.fbs" "${CMAKE_CURRENT_LIST_DIR}/format/common.fbs") -set(OUTPUT_DIR ${CMAKE_BINARY_DIR}/src/plasma) +set(OUTPUT_DIR ${ARROW_BINARY_DIR}/src/plasma) set(PLASMA_FBS_OUTPUT_FILES "${OUTPUT_DIR}/common_generated.h" @@ -96,6 +95,8 @@ ADD_ARROW_LIB(plasma SHARED_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_LINK_LIBS} STATIC_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_STATIC_LINK_LIBS}) +add_dependencies(plasma ${PLASMA_LIBRARIES}) + foreach(LIB_TARGET ${PLASMA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) @@ -127,6 +128,7 @@ endif() # be copied around and used in different locations. 
add_executable(plasma_store_server store.cc) target_link_libraries(plasma_store_server plasma_static ${PLASMA_STATIC_LINK_LIBS}) +add_dependencies(plasma plasma_store_server) if (ARROW_RPATH_ORIGIN) if (APPLE) @@ -138,7 +140,6 @@ if (ARROW_RPATH_ORIGIN) INSTALL_RPATH ${_lib_install_rpath}) endif() -# Headers: top level install(FILES common.h compat.h @@ -149,15 +150,12 @@ install(FILES # Plasma store set_target_properties(plasma_store_server PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) -install(TARGETS plasma_store_server DESTINATION ${CMAKE_INSTALL_BINDIR}) +install(TARGETS plasma_store_server + ${INSTALL_IS_OPTIONAL} + DESTINATION ${CMAKE_INSTALL_BINDIR}) # pkg-config support -configure_file(plasma.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("plasma") if(ARROW_PLASMA_JAVA_CLIENT) # Plasma java client support @@ -198,8 +196,20 @@ endif() # Unit tests ####################################### -ADD_ARROW_TEST(test/serialization_tests +# Adding unit tests part of the "arrow" portion of the test suite +function(ADD_PLASMA_TEST REL_TEST_NAME) + set(options) + set(one_value_args) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + ADD_TEST_CASE(${REL_TEST_NAME} + PREFIX "plasma" + LABELS "plasma-tests" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +ADD_PLASMA_TEST(test/serialization_tests EXTRA_LINK_LIBS plasma_shared ${PLASMA_LINK_LIBS}) -ADD_ARROW_TEST(test/client_tests +ADD_PLASMA_TEST(test/client_tests EXTRA_LINK_LIBS plasma_shared ${PLASMA_LINK_LIBS} EXTRA_DEPENDENCIES plasma_store_server) diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 99cf00cab80fd..f08d6efd71ee7 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -64,7 +64,7 @@ using arrow::cuda::CudaDeviceManager; #define XXH_INLINE_ALL 1 #define XXH_NAMESPACE plasma_client_ -#include "arrow/util/xxhash/xxhash.h" +#include "arrow/vendored/xxhash/xxhash.h" #define XXH64_DEFAULT_SEED 0 @@ -83,9 +83,6 @@ typedef struct XXH64_state_s XXH64_state_t; constexpr int64_t kHashingConcurrency = 8; constexpr int64_t kBytesInMB = 1 << 20; -// Use 100MB as an overestimate of the L3 cache size. -constexpr int64_t kL3CacheSizeBytes = 100000000; - // ---------------------------------------------------------------------- // GPU support @@ -143,22 +140,13 @@ struct ObjectInUseEntry { bool is_sealed; }; -/// Configuration options for the plasma client. -struct PlasmaClientConfig { - /// Number of release calls we wait until the object is actually released. - /// This allows us to avoid invalidating the cpu cache on workers if objects - /// are reused accross tasks. - size_t release_delay; -}; - struct ClientMmapTableEntry { + /// The associated file descriptor on the client. + int fd; /// The result of mmap for this file descriptor. uint8_t* pointer; /// The length of the memory-mapped file. size_t length; - /// The number of objects in this memory-mapped file that are currently being - /// used by the client. When this count reaches zeros, we unmap the file. - int count; }; class PlasmaClient::Impl : public std::enable_shared_from_this { @@ -169,7 +157,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this> objects_in_use_; - /// Object IDs of the last few release calls. 
This is a deque and - /// is used to delay releasing objects to see if they can be reused by - /// subsequent tasks so we do not unneccessarily invalidate cpu caches. - /// TODO(pcm): replace this with a proper lru cache using the size of the L3 - /// cache. - std::deque release_history_; - /// The number of bytes in the combined objects that are held in the release - /// history doubly-linked list. If this is too large then the client starts - /// releasing objects. - int64_t in_use_object_bytes_; - /// Configuration options for the plasma client. - PlasmaClientConfig config_; /// The amount of memory available to the Plasma store. The client needs this /// information to make sure that it does not delay in releasing so much /// memory that the store is unable to evict enough objects to free up space. @@ -294,7 +261,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_thisRelease(object_id_)); } -PlasmaClient::Impl::Impl() { +PlasmaClient::Impl::Impl() : store_conn_(0), store_capacity_(0) { #ifdef PLASMA_CUDA DCHECK_OK(CudaDeviceManager::GetInstance(&manager_)); #endif @@ -308,7 +275,6 @@ PlasmaClient::Impl::~Impl() {} uint8_t* PlasmaClient::Impl::LookupOrMmap(int fd, int store_fd_val, int64_t map_size) { auto entry = mmap_table_.find(store_fd_val); if (entry != mmap_table_.end()) { - close(fd); return entry->second.pointer; } else { // We subtract kMmapRegionsGap from the length that was added @@ -322,9 +288,9 @@ uint8_t* PlasmaClient::Impl::LookupOrMmap(int fd, int store_fd_val, int64_t map_ close(fd); // Closing this fd has an effect on performance. ClientMmapTableEntry& entry = mmap_table_[store_fd_val]; + entry.fd = fd; entry.pointer = result; entry.length = map_size; - entry.count = 0; return result; } } @@ -342,6 +308,17 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID& object_id) { return (elem != objects_in_use_.end()); } +int PlasmaClient::Impl::GetStoreFd(int store_fd) { + auto entry = mmap_table_.find(store_fd); + if (entry == mmap_table_.end()) { + int fd = recv_fd(store_conn_); + ARROW_CHECK(fd >= 0) << "recv not successful"; + return fd; + } else { + return entry->second.fd; + } +} + void PlasmaClient::Impl::IncrementObjectCount(const ObjectID& object_id, PlasmaObject* object, bool is_sealed) { // Increment the count of the object to track the fact that it is being used. @@ -357,18 +334,6 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID& object_id, objects_in_use_[object_id]->count = 0; objects_in_use_[object_id]->is_sealed = is_sealed; object_entry = objects_in_use_[object_id].get(); - if (object->device_num == 0) { - // Increment the count of the number of objects in the memory-mapped file - // that are being used. The corresponding decrement should happen in - // PlasmaClient::Release. - auto entry = mmap_table_.find(object->store_fd); - ARROW_CHECK(entry != mmap_table_.end()); - ARROW_CHECK(entry->second.count >= 0); - // Update the in_use_object_bytes_. - in_use_object_bytes_ += - (object_entry->object.data_size + object_entry->object.metadata_size); - entry->second.count += 1; - } } else { object_entry = elem->second.get(); ARROW_CHECK(object_entry->count > 0); @@ -397,8 +362,7 @@ Status PlasmaClient::Impl::Create(const ObjectID& object_id, int64_t data_size, // If the CreateReply included an error, then the store will not send a file // descriptor. 
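// The fd caching introduced above (GetStoreFd on the client, mirrored by the
// store-side used_fds set later in this patch) means each memory-mapped
// region's descriptor crosses the socket at most once per client. A minimal
// standalone model of that invariant, with hypothetical names (an
// illustrative sketch, not code from the patch):
#include <unordered_map>
#include <unordered_set>

struct FdCacheModel {
  std::unordered_set<int> sent_by_store;      // analogous to Client::used_fds
  std::unordered_map<int, int> client_table;  // store fd -> cached local fd

  // Store side: true only the first time this mmap region goes to the client.
  bool StoreMustSend(int store_fd) {
    return sent_by_store.insert(store_fd).second;
  }

  // Client side: reuse the cached fd; cache it when freshly received.
  int ClientResolve(int store_fd, int received_fd) {
    auto it = client_table.find(store_fd);
    if (it != client_table.end()) {
      return it->second;  // store skipped the send; no recv_fd() needed
    }
    client_table.emplace(store_fd, received_fd);
    return received_fd;
  }
};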
if (device_num == 0) { - int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0) << "recv not successful"; + int fd = GetStoreFd(store_fd); ARROW_CHECK(object.data_size == data_size); ARROW_CHECK(object.metadata_size == metadata_size); // The metadata should come right after the data. @@ -535,8 +499,7 @@ Status PlasmaClient::Impl::GetBuffers( // in the subsequent loop based on just the store file descriptor and without // having to know the relevant file descriptor received from recv_fd. for (size_t i = 0; i < store_fds.size(); i++) { - int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0); + int fd = GetStoreFd(store_fds[i]); LookupOrMmap(fd, store_fds[i], mmap_sizes[i]); } @@ -615,54 +578,21 @@ Status PlasmaClient::Impl::Get(const ObjectID* object_ids, int64_t num_objects, return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out); } -Status PlasmaClient::Impl::UnmapObject(const ObjectID& object_id) { +Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID& object_id) { auto object_entry = objects_in_use_.find(object_id); ARROW_CHECK(object_entry != objects_in_use_.end()); ARROW_CHECK(object_entry->second->count == 0); - // Decrement the count of the number of objects in this memory-mapped file - // that the client is using. The corresponding increment should have - // happened in plasma_get. - int fd = object_entry->second->object.store_fd; - auto entry = mmap_table_.find(fd); - ARROW_CHECK(entry != mmap_table_.end()); - ARROW_CHECK(entry->second.count >= 1); - if (entry->second.count == 1) { - // If no other objects are being used, then unmap the file. - // We subtract kMmapRegionsGap from the length that was added - // in fake_mmap in malloc.h, to make the size page-aligned again. - int err = munmap(entry->second.pointer, entry->second.length - kMmapRegionsGap); - if (err == -1) { - return Status::IOError("Error during munmap"); - } - // Remove the corresponding entry from the hash table. - mmap_table_.erase(fd); - } else { - // If there are other objects being used, decrement the reference count. - entry->second.count -= 1; - } - // Update the in_use_object_bytes_. - in_use_object_bytes_ -= (object_entry->second->object.data_size + - object_entry->second->object.metadata_size); - DCHECK_GE(in_use_object_bytes_, 0); // Remove the entry from the hash table of objects currently in use. objects_in_use_.erase(object_id); return Status::OK(); } -/// This is a helper method for implementing plasma_release. We maintain a -/// buffer -/// of release calls and only perform them once the buffer becomes full (as -/// judged by the aggregate sizes of the objects). There may be multiple release -/// calls for the same object ID in the buffer. In this case, the first release -/// calls will not do anything. The client will only send a message to the store -/// releasing the object when the client is truly done with the object. -/// -/// @param object_id The object ID to attempt to release. -Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { - // Decrement the count of the number of instances of this object that are - // being used by this client. The corresponding increment should have happened - // in PlasmaClient::Get. +Status PlasmaClient::Impl::Release(const ObjectID& object_id) { + // If the client is already disconnected, ignore release requests. 
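// Release() below is now eager: the moment an object's local refcount hits
// zero, the client marks it unused and notifies the store, instead of parking
// the ID in the deleted release_history_ deque. A tiny model of the new
// semantics (hypothetical names; a sketch, not the patch's code):
#include <unordered_map>

struct EagerReleaseModel {
  std::unordered_map<int, int> refcount;  // object id -> local references
  int releases_sent = 0;                  // messages delivered to the store

  void Get(int id) { ++refcount[id]; }
  void Release(int id) {
    auto it = refcount.find(id);
    if (it == refcount.end()) return;  // not held; nothing to do in this model
    if (--it->second == 0) {
      refcount.erase(it);
      ++releases_sent;  // previously deferred behind the release cache
    }
  }
};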
+ if (store_conn_ < 0) { + return Status::OK(); + } auto object_entry = objects_in_use_.find(object_id); ARROW_CHECK(object_entry != objects_in_use_.end()); object_entry->second->count -= 1; @@ -670,7 +600,7 @@ Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { // Check if the client is no longer using this object. if (object_entry->second->count == 0) { // Tell the store that the client no longer needs the object. - RETURN_NOT_OK(UnmapObject(object_id)); + RETURN_NOT_OK(MarkObjectUnused(object_id)); RETURN_NOT_OK(SendReleaseRequest(store_conn_, object_id)); auto iter = deletion_cache_.find(object_id); if (iter != deletion_cache_.end()) { @@ -681,50 +611,6 @@ Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { return Status::OK(); } -Status PlasmaClient::Impl::Release(const ObjectID& object_id) { - // If the client is already disconnected, ignore release requests. - if (store_conn_ < 0) { - return Status::OK(); - } - // If an object is in the deletion cache, handle it directly without waiting. - auto iter = deletion_cache_.find(object_id); - if (iter != deletion_cache_.end()) { - RETURN_NOT_OK(PerformRelease(object_id)); - return Status::OK(); - } - // Add the new object to the release history. - release_history_.push_front(object_id); - // If there are too many bytes in use by the client or if there are too many - // pending release calls, and there are at least some pending release calls in - // the release_history list, then release some objects. - - // TODO(wap): Eviction policy only works on host memory, and thus objects on - // the GPU cannot be released currently. - while ((in_use_object_bytes_ > std::min(kL3CacheSizeBytes, store_capacity_ / 100) || - release_history_.size() > config_.release_delay) && - release_history_.size() > 0) { - // Perform a release for the object ID for the first pending release. - RETURN_NOT_OK(PerformRelease(release_history_.back())); - // Remove the last entry from the release history. - release_history_.pop_back(); - } - return Status::OK(); -} - -Status PlasmaClient::Impl::FlushReleaseHistory() { - // If the client is already disconnected, ignore the flush. - if (store_conn_ < 0) { - return Status::OK(); - } - while (release_history_.size() > 0) { - // Perform a release for the object ID for the first pending release. - RETURN_NOT_OK(PerformRelease(release_history_.back())); - // Remove the last entry from the release history. - release_history_.pop_back(); - } - return Status::OK(); -} - // This method is used to query whether the plasma store contains an object. Status PlasmaClient::Impl::Contains(const ObjectID& object_id, bool* has_object) { // Check if we already have a reference to the object. @@ -855,8 +741,6 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { ARROW_CHECK(!object_entry->second->is_sealed) << "Plasma client called abort on a sealed object"; - // Flush the release history. - RETURN_NOT_OK(FlushReleaseHistory()); // Make sure that the Plasma client only has one reference to the object. If // it has more, then the client needs to release the buffer before calling // abort. @@ -868,7 +752,7 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { RETURN_NOT_OK(SendAbortRequest(store_conn_, object_id)); // Decrease the reference count to zero, then remove the object. 
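// Abort() (continued below) still demands that the caller hold exactly one
// reference to the unsealed object; surplus references must be released
// first. A usage sketch built from the public API in client.h (error
// handling elided; the 100-byte size and the empty metadata are arbitrary
// parts of the sketch, not from the patch):
//
//   std::shared_ptr<Buffer> data;
//   ARROW_CHECK_OK(client.Create(object_id, 100, nullptr, 0, &data));
//   // ... the write cannot be completed ...
//   ARROW_CHECK_OK(client.Abort(object_id));  // object never becomes visible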
object_entry->second->count--; - RETURN_NOT_OK(UnmapObject(object_id)); + RETURN_NOT_OK(MarkObjectUnused(object_id)); std::vector buffer; ObjectID id; @@ -878,7 +762,6 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { } Status PlasmaClient::Impl::Delete(const std::vector& object_ids) { - RETURN_NOT_OK(FlushReleaseHistory()); std::vector not_in_use_ids; for (auto& object_id : object_ids) { // If the object is in used, skip it. @@ -976,13 +859,12 @@ Status PlasmaClient::Impl::Connect(const std::string& store_socket_name, int release_delay, int num_retries) { RETURN_NOT_OK(ConnectIpcSocketRetry(store_socket_name, num_retries, -1, &store_conn_)); if (manager_socket_name != "") { - RETURN_NOT_OK( - ConnectIpcSocketRetry(manager_socket_name, num_retries, -1, &manager_conn_)); - } else { - manager_conn_ = -1; + return Status::NotImplemented("plasma manager is no longer supported"); + } + if (release_delay != 0) { + ARROW_LOG(WARNING) << "The release_delay parameter in PlasmaClient::Connect " + << "is deprecated"; } - config_.release_delay = release_delay; - in_use_object_bytes_ = 0; // Send a ConnectRequest to the store to get its memory capacity. RETURN_NOT_OK(SendConnectRequest(store_conn_)); std::vector buffer; @@ -1000,78 +882,6 @@ Status PlasmaClient::Impl::Disconnect() { // that were in use by us when handling the SIGPIPE. close(store_conn_); store_conn_ = -1; - if (manager_conn_ >= 0) { - close(manager_conn_); - manager_conn_ = -1; - } - return Status::OK(); -} - -Status PlasmaClient::Impl::Transfer(const char* address, int port, - const ObjectID& object_id) { - return SendDataRequest(manager_conn_, object_id, address, port); -} - -Status PlasmaClient::Impl::Fetch(int num_object_ids, const ObjectID* object_ids) { - ARROW_CHECK(manager_conn_ >= 0); - return SendFetchRequest(manager_conn_, object_ids, num_object_ids); -} - -int PlasmaClient::Impl::get_manager_fd() const { return manager_conn_; } - -Status PlasmaClient::Impl::Info(const ObjectID& object_id, int* object_status) { - ARROW_CHECK(manager_conn_ >= 0); - - RETURN_NOT_OK(SendStatusRequest(manager_conn_, &object_id, 1)); - std::vector buffer; - RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType::PlasmaStatusReply, &buffer)); - ObjectID id; - RETURN_NOT_OK(ReadStatusReply(buffer.data(), buffer.size(), &id, object_status, 1)); - ARROW_CHECK(object_id == id); - return Status::OK(); -} - -Status PlasmaClient::Impl::Wait(int64_t num_object_requests, - ObjectRequest* object_requests, int num_ready_objects, - int64_t timeout_ms, int* num_objects_ready) { - ARROW_CHECK(manager_conn_ >= 0); - ARROW_CHECK(num_object_requests > 0); - ARROW_CHECK(num_ready_objects > 0); - ARROW_CHECK(num_ready_objects <= num_object_requests); - - for (int i = 0; i < num_object_requests; ++i) { - ARROW_CHECK(object_requests[i].type == ObjectRequestType::PLASMA_QUERY_LOCAL || - object_requests[i].type == ObjectRequestType::PLASMA_QUERY_ANYWHERE); - } - - RETURN_NOT_OK(SendWaitRequest(manager_conn_, object_requests, num_object_requests, - num_ready_objects, timeout_ms)); - std::vector buffer; - RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType::PlasmaWaitReply, &buffer)); - RETURN_NOT_OK( - ReadWaitReply(buffer.data(), buffer.size(), object_requests, &num_ready_objects)); - - *num_objects_ready = 0; - for (int i = 0; i < num_object_requests; ++i) { - ObjectRequestType type = object_requests[i].type; - auto status = static_cast(object_requests[i].location); - switch (type) { - case ObjectRequestType::PLASMA_QUERY_LOCAL: - if (status == 
fb::ObjectStatus::Local) { - *num_objects_ready += 1; - } - break; - case ObjectRequestType::PLASMA_QUERY_ANYWHERE: - if (status == fb::ObjectStatus::Local || status == fb::ObjectStatus::Remote) { - *num_objects_ready += 1; - } else { - ARROW_CHECK(status == fb::ObjectStatus::Nonexistent); - } - break; - default: - ARROW_LOG(FATAL) << "This code should be unreachable."; - } - } return Status::OK(); } @@ -1154,29 +964,6 @@ Status PlasmaClient::DecodeNotification(const uint8_t* buffer, ObjectID* object_ Status PlasmaClient::Disconnect() { return impl_->Disconnect(); } -Status PlasmaClient::Fetch(int num_object_ids, const ObjectID* object_ids) { - return impl_->Fetch(num_object_ids, object_ids); -} - -Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready) { - return impl_->Wait(num_object_requests, object_requests, num_ready_objects, timeout_ms, - num_objects_ready); -} - -Status PlasmaClient::Transfer(const char* addr, int port, const ObjectID& object_id) { - return impl_->Transfer(addr, port, object_id); -} - -Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { - return impl_->Info(object_id, object_status); -} - -int PlasmaClient::get_manager_fd() const { return impl_->get_manager_fd(); } - -Status PlasmaClient::FlushReleaseHistory() { return impl_->FlushReleaseHistory(); } - bool PlasmaClient::IsInUse(const ObjectID& object_id) { return impl_->IsInUse(object_id); } diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 9e080b7760dc8..ac9e8eb0fe9c9 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -34,11 +34,6 @@ using arrow::Status; namespace plasma { -/// We keep a queue of unreleased objects cached in the client until we start -/// sending release requests to the store. This is to avoid frequently mapping -/// and unmapping objects and evicting data from processor caches. -constexpr int64_t kPlasmaDefaultReleaseDelay = 64; - /// Object buffer data structure. struct ObjectBuffer { /// The data buffer. @@ -54,21 +49,21 @@ class ARROW_EXPORT PlasmaClient { PlasmaClient(); ~PlasmaClient(); - /// Connect to the local plasma store and plasma manager. Return - /// the resulting connection. + /// Connect to the local plasma store. Return the resulting connection. /// /// \param store_socket_name The name of the UNIX domain socket to use to /// connect to the Plasma store. /// \param manager_socket_name The name of the UNIX domain socket to use to /// connect to the local Plasma manager. If this is "", then this /// function will not connect to a manager. - /// \param release_delay Number of released objects that are kept around - /// and not evicted to avoid too many munmaps. + /// Note that plasma manager is no longer supported, this function + /// will return failure if this is not "". + /// \param release_delay Deprecated (not used). /// \param num_retries number of attempts to connect to IPC socket, default 50 /// \return The return status. Status Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, - int release_delay = kPlasmaDefaultReleaseDelay, int num_retries = -1); + const std::string& manager_socket_name = "", int release_delay = 0, + int num_retries = -1); /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. @@ -255,109 +250,12 @@ class ARROW_EXPORT PlasmaClient { /// \return The return status. 
Status Disconnect(); - /// Attempt to initiate the transfer of some objects from remote Plasma - /// Stores. - /// This method does not guarantee that the fetched objects will arrive - /// locally. - /// - /// For an object that is available in the local Plasma Store, this method - /// will - /// not do anything. For an object that is not available locally, it will - /// check - /// if the object are already being fetched. If so, it will not do anything. - /// If - /// not, it will query the object table for a list of Plasma Managers that - /// have - /// the object. The object table will return a non-empty list, and this Plasma - /// Manager will attempt to initiate transfers from one of those Plasma - /// Managers. - /// - /// This function is non-blocking. - /// - /// This method is idempotent in the sense that it is ok to call it multiple - /// times. - /// - /// \param num_object_ids The number of object IDs fetch is being called on. - /// \param object_ids The IDs of the objects that fetch is being called on. - /// \return The return status. - Status Fetch(int num_object_ids, const ObjectID* object_ids); - - /// Wait for (1) a specified number of objects to be available (sealed) in the - /// local Plasma Store or in a remote Plasma Store, or (2) for a timeout to - /// expire. This is a blocking call. - /// - /// \param num_object_requests Size of the object_requests array. - /// \param object_requests Object event array. Each element contains a request - /// for a particular object_id. The type of request is specified in the - /// "type" field. - /// - A PLASMA_QUERY_LOCAL request is satisfied when object_id becomes - /// available in the local Plasma Store. In this case, this function - /// sets the "status" field to ObjectStatus::Local. Note, if the - /// status - /// is not ObjectStatus::Local, it will be ObjectStatus::Nonexistent, - /// but it may exist elsewhere in the system. - /// - A PLASMA_QUERY_ANYWHERE request is satisfied when object_id - /// becomes - /// available either at the local Plasma Store or on a remote Plasma - /// Store. In this case, the functions sets the "status" field to - /// ObjectStatus::Local or ObjectStatus::Remote. - /// \param num_ready_objects The number of requests in object_requests array - /// that - /// must be satisfied before the function returns, unless it timeouts. - /// The num_ready_objects should be no larger than num_object_requests. - /// \param timeout_ms Timeout value in milliseconds. If this timeout expires - /// before min_num_ready_objects of requests are satisfied, the - /// function - /// returns. - /// \param num_objects_ready Out parameter for number of satisfied requests in - /// the object_requests list. If the returned number is less than - /// min_num_ready_objects this means that timeout expired. - /// \return The return status. - Status Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); - - /// Transfer local object to a different plasma manager. - /// - /// \param addr IP address of the plasma manager we are transfering to. - /// \param port Port of the plasma manager we are transfering to. - /// \param object_id ObjectID of the object we are transfering. - /// \return The return status. - Status Transfer(const char* addr, int port, const ObjectID& object_id); - - /// Return the status of a given object. This method may query the object - /// table. - /// - /// \param object_id The ID of the object whose status we query. 
- /// \param object_status Out parameter for object status. Can take the - /// following values. - /// - PLASMA_CLIENT_LOCAL, if object is stored in the local Plasma - /// Store. - /// has been already scheduled by the Plasma Manager. - /// - PLASMA_CLIENT_TRANSFER, if the object is either currently being - /// transferred or just scheduled. - /// - PLASMA_CLIENT_REMOTE, if the object is stored at a remote - /// Plasma Store. - /// - PLASMA_CLIENT_DOES_NOT_EXIST, if the object doesn’t exist in the - /// system. - /// \return The return status. - Status Info(const ObjectID& object_id, int* object_status); - - /// Get the file descriptor for the socket connection to the plasma manager. - /// - /// \return The file descriptor for the manager connection. If there is no - /// connection to the manager, this is -1. - int get_manager_fd() const; - private: friend class PlasmaBuffer; FRIEND_TEST(TestPlasmaStore, GetTest); FRIEND_TEST(TestPlasmaStore, LegacyGetTest); FRIEND_TEST(TestPlasmaStore, AbortTest); - /// This is a helper method that flushes all pending release calls to the - /// store. - Status FlushReleaseHistory(); - bool IsInUse(const ObjectID& object_id); class ARROW_NO_EXPORT Impl; diff --git a/cpp/src/plasma/common.cc b/cpp/src/plasma/common.cc index 0ca17cf814f8a..1b86fd80b4920 100644 --- a/cpp/src/plasma/common.cc +++ b/cpp/src/plasma/common.cc @@ -107,9 +107,6 @@ bool UniqueID::operator==(const UniqueID& rhs) const { return std::memcmp(data(), rhs.data(), kUniqueIDSize) == 0; } -ARROW_EXPORT fb::ObjectStatus ObjectStatusLocal = fb::ObjectStatus::Local; -ARROW_EXPORT fb::ObjectStatus ObjectStatusRemote = fb::ObjectStatus::Remote; - const PlasmaStoreInfo* plasma_config; } // namespace plasma diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 7090428ff41c9..dfbd90c3aa553 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -33,7 +33,6 @@ #include "plasma/compat.h" #include "arrow/status.h" -#include "arrow/util/logging.h" #ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" #endif @@ -66,30 +65,6 @@ typedef UniqueID ObjectID; /// Size of object hash digests. constexpr int64_t kDigestSize = sizeof(uint64_t); -enum class ObjectRequestType : int { - /// Query for object in the local plasma store. - PLASMA_QUERY_LOCAL = 1, - /// Query for object in the local plasma store or in a remote plasma store. - PLASMA_QUERY_ANYWHERE -}; - -/// Object request data structure. Used for Wait. -struct ObjectRequest { - /// The ID of the requested object. If ID_NIL request any object. - ObjectID object_id; - /// Request associated to the object. It can take one of the following values: - /// - PLASMA_QUERY_LOCAL: return if or when the object is available in the - /// local Plasma Store. - /// - PLASMA_QUERY_ANYWHERE: return if or when the object is available in - /// the system (i.e., either in the local or a remote Plasma Store). - ObjectRequestType type; - /// Object location. This can be - /// - ObjectLocation::Local: object is ready at the local Plasma Store. - /// - ObjectLocation::Remote: object is ready at a remote Plasma Store. - /// - ObjectLocation::Nonexistent: object does not exist in the system. - ObjectLocation location; -}; - enum class ObjectState : int { /// Object was created but not sealed in the local Plasma Store. PLASMA_CREATED = 1, @@ -97,6 +72,12 @@ enum class ObjectState : int { PLASMA_SEALED }; +namespace internal { + +struct CudaIpcPlaceholder {}; + +} // namespace internal + /// This type is used by the Plasma store. 
It is here because it is exposed to /// the eviction policy. struct ObjectTableEntry { @@ -118,10 +99,6 @@ struct ObjectTableEntry { int64_t data_size; /// Size of the object metadata in bytes. int64_t metadata_size; -#ifdef PLASMA_CUDA - /// IPC GPU handle to share with clients. - std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; -#endif /// Number of clients currently using this object. int ref_count; /// Unix epoch of when this object was created. @@ -133,6 +110,13 @@ struct ObjectTableEntry { ObjectState state; /// The digest of the object. Used to see if two objects are the same. unsigned char digest[kDigestSize]; + +#ifdef PLASMA_CUDA + /// IPC GPU handle to share with clients. + std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; +#else + std::shared_ptr ipc_handle; +#endif }; /// Mapping from ObjectIDs to information about the object. diff --git a/cpp/src/plasma/eviction_policy.cc b/cpp/src/plasma/eviction_policy.cc index e5beb5a579e28..4fb0cce81ecee 100644 --- a/cpp/src/plasma/eviction_policy.cc +++ b/cpp/src/plasma/eviction_policy.cc @@ -85,7 +85,7 @@ bool EvictionPolicy::RequireSpace(int64_t size, std::vector* objects_t ARROW_LOG(INFO) << "There is not enough space to create this object, so evicting " << objects_to_evict->size() << " objects to free up " << num_bytes_evicted << " bytes. The number of bytes in use (before " - << "this eviction) is " << memory_used_ << "."; + << "this eviction) is " << (memory_used_ + num_bytes_evicted) << "."; return num_bytes_evicted >= required_space && num_bytes_evicted > 0; } diff --git a/cpp/src/plasma/format/plasma.fbs b/cpp/src/plasma/format/plasma.fbs index ef934fbd81ed2..b3c890391887e 100644 --- a/cpp/src/plasma/format/plasma.fbs +++ b/cpp/src/plasma/format/plasma.fbs @@ -42,9 +42,6 @@ enum MessageType:long { // Delete an object. PlasmaDeleteRequest, PlasmaDeleteReply, - // Get status of an object. - PlasmaStatusRequest, - PlasmaStatusReply, // See if the store contains an object (will be deprecated). PlasmaContainsRequest, PlasmaContainsReply, @@ -57,11 +54,6 @@ enum MessageType:long { // Make room for new objects in the plasma store. PlasmaEvictRequest, PlasmaEvictReply, - // Fetch objects from remote Plasma stores. - PlasmaFetchRequest, - // Wait for objects to be ready either from local or remote Plasma stores. - PlasmaWaitRequest, - PlasmaWaitReply, // Subscribe to a list of objects or to all objects. PlasmaSubscribeRequest, // Unsubscribe. @@ -239,35 +231,6 @@ table PlasmaDeleteReply { errors: [PlasmaError]; } -table PlasmaStatusRequest { - // IDs of the objects stored at local Plasma store we request the status of. - object_ids: [string]; -} - -enum ObjectStatus:int { - // Object is stored in the local Plasma Store. - Local, - // Object is stored on a remote Plasma store, and it is not stored on the - // local Plasma Store. - Remote, - // Object is not stored in the system. - Nonexistent, - // Object is currently transferred from a remote Plasma store the local - // Plasma Store. - Transfer -} - -table PlasmaStatusReply { - // IDs of the objects being returned. - object_ids: [string]; - // Status of the object. - status: [ObjectStatus]; -} - -// PlasmaContains is a subset of PlasmaStatus which does not -// involve the plasma manager, only the store. We should consider -// unifying them in the future and deprecating PlasmaContains. - table PlasmaContainsRequest { // ID of the object we are querying. 
object_id: string; @@ -309,43 +272,6 @@ table PlasmaEvictReply { num_bytes: ulong; } -table PlasmaFetchRequest { - // IDs of objects to be gotten. - object_ids: [string]; -} - -table ObjectRequestSpec { - // ID of the object. - object_id: string; - // The type of the object. This specifies whether we - // will be waiting for an object store in the local or - // global Plasma store. - type: int; -} - -table PlasmaWaitRequest { - // Array of object requests whose status we are asking for. - object_requests: [ObjectRequestSpec]; - // Number of objects expected to be returned, if available. - num_ready_objects: int; - // timeout - timeout: long; -} - -table ObjectReply { - // ID of the object. - object_id: string; - // The object status. This specifies where the object is stored. - status: ObjectStatus; -} - -table PlasmaWaitReply { - // Array of object requests being returned. - object_requests: [ObjectReply]; - // Number of objects expected to be returned, if available. - num_ready_objects: int; -} - table PlasmaSubscribeRequest { } diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index d63ceb6da24da..cc425428ecee5 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -22,6 +22,7 @@ #include #include "arrow/status.h" +#include "arrow/util/logging.h" #include "plasma/common.h" #include "plasma/plasma_generated.h" @@ -49,7 +50,7 @@ Status WriteBytes(int fd, uint8_t* cursor, size_t length) { if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { continue; } - return Status::IOError(std::string(strerror(errno))); + return Status::IOError(strerror(errno)); } else if (nbytes == 0) { return Status::IOError("Encountered unexpected EOF"); } @@ -80,7 +81,7 @@ Status ReadBytes(int fd, uint8_t* cursor, size_t length) { if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { continue; } - return Status::IOError(std::string(strerror(errno))); + return Status::IOError(strerror(errno)); } else if (0 == nbytes) { return Status::IOError("Encountered unexpected EOF"); } @@ -171,12 +172,12 @@ Status ConnectIpcSocketRetry(const std::string& pathname, int num_retries, *fd = ConnectIpcSock(pathname); --num_retries; } + // If we could not connect to the socket, exit. 
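// The io.cc changes above and below lean on the Status factories accepting
// multiple message fragments, which retires the std::stringstream dance.
// Side-by-side sketch of the two styles seen in this hunk:
//
//   // before
//   std::stringstream ss;
//   ss << "Could not connect to socket " << pathname;
//   return Status::IOError(ss.str());
//
//   // after: fragments are concatenated by the factory
//   return Status::IOError("Could not connect to socket ", pathname);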
if (*fd == -1) { - std::stringstream ss; - ss << "Could not connect to socket " << pathname; - return Status::IOError(ss.str()); + return Status::IOError("Could not connect to socket ", pathname); } + return Status::OK(); } diff --git a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc index 7cd2f3574423c..1988742af9bc7 100644 --- a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc +++ b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc @@ -28,6 +28,8 @@ #include #include +#include "arrow/util/logging.h" + #include "plasma/client.h" constexpr jsize OBJECT_ID_SIZE = sizeof(plasma::ObjectID) / sizeof(jbyte); @@ -102,15 +104,15 @@ JNIEXPORT jobject JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_create( std::shared_ptr data; Status s = client->Create(oid, size, md, md_size, &data); if (s.IsPlasmaObjectExists()) { - jclass Exception = env->FindClass("java/lang/Exception"); - env->ThrowNew(Exception, - "An object with this ID already exists in the plasma store."); + jclass exceptionClass = + env->FindClass("org/apache/arrow/plasma/exceptions/DuplicateObjectException"); + env->ThrowNew(exceptionClass, oid.hex().c_str()); return nullptr; } if (s.IsPlasmaStoreFull()) { - jclass Exception = env->FindClass("java/lang/Exception"); - env->ThrowNew(Exception, - "The plasma store ran out of memory and could not create this object."); + jclass exceptionClass = + env->FindClass("org/apache/arrow/plasma/exceptions/PlasmaOutOfMemoryException"); + env->ThrowNew(exceptionClass, ""); return nullptr; } ARROW_CHECK(s.ok()); @@ -220,79 +222,6 @@ JNIEXPORT jboolean JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_contains return has_object; } -JNIEXPORT void JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_fetch( - JNIEnv* env, jclass cls, jlong conn, jobjectArray object_ids) { - plasma::PlasmaClient* client = reinterpret_cast(conn); - jsize num_oids = env->GetArrayLength(object_ids); - - std::vector oids(num_oids); - for (int i = 0; i < num_oids; ++i) { - jbyteArray_to_object_id( - env, reinterpret_cast(env->GetObjectArrayElement(object_ids, i)), - &oids[i]); - } - - ARROW_CHECK_OK(client->Fetch(static_cast(num_oids), oids.data())); - - return; -} - -JNIEXPORT jobjectArray JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_wait( - JNIEnv* env, jclass cls, jlong conn, jobjectArray object_ids, jint timeout_ms, - jint num_returns) { - plasma::PlasmaClient* client = reinterpret_cast(conn); - jsize num_oids = env->GetArrayLength(object_ids); - - if (num_returns < 0) { - jclass Exception = env->FindClass("java/lang/RuntimeException"); - env->ThrowNew(Exception, "The argument num_returns cannot be less than zero."); - return nullptr; - } - if (num_returns > num_oids) { - jclass Exception = env->FindClass("java/lang/RuntimeException"); - env->ThrowNew(Exception, - "The argument num_returns cannot be greater than len(object_ids)."); - return nullptr; - } - - std::vector oreqs(num_oids); - - for (int i = 0; i < num_oids; ++i) { - jbyteArray_to_object_id( - env, reinterpret_cast(env->GetObjectArrayElement(object_ids, i)), - &oreqs[i].object_id); - oreqs[i].type = plasma::ObjectRequestType::PLASMA_QUERY_ANYWHERE; - } - - int num_return_objects; - // TODO: may be blocked. 
consider to add the thread support - ARROW_CHECK_OK(client->Wait(static_cast(num_oids), oreqs.data(), num_returns, - static_cast(timeout_ms), &num_return_objects)); - - int num_to_return = std::min(num_return_objects, num_returns); - jclass clsByteArray = env->FindClass("[B"); - jobjectArray ret = env->NewObjectArray(num_to_return, clsByteArray, nullptr); - - int num_returned = 0; - jbyteArray oid = nullptr; - for (int i = 0; i < num_oids; ++i) { - if (num_returned >= num_to_return) { - break; - } - - if (oreqs[i].location == plasma::ObjectLocation::Local || - oreqs[i].location == plasma::ObjectLocation::Remote) { - oid = env->NewByteArray(OBJECT_ID_SIZE); - object_id_to_jbyteArray(env, oid, &oreqs[i].object_id); - env->SetObjectArrayElement(ret, num_returned, oid); - num_returned++; - } - } - ARROW_CHECK(num_returned == num_to_return); - - return ret; -} - JNIEXPORT jlong JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_evict( JNIEnv* env, jclass cls, jlong conn, jlong num_bytes) { plasma::PlasmaClient* client = reinterpret_cast(conn); diff --git a/cpp/src/plasma/plasma.cc b/cpp/src/plasma/plasma.cc index 601a612be4071..a1749d0865d50 100644 --- a/cpp/src/plasma/plasma.cc +++ b/cpp/src/plasma/plasma.cc @@ -22,6 +22,7 @@ #include #include "plasma/common.h" +#include "plasma/common_generated.h" #include "plasma/protocol.h" namespace fb = plasma::flatbuf; diff --git a/cpp/src/plasma/plasma.h b/cpp/src/plasma/plasma.h index 83caec7ee4958..3d3eca19acc63 100644 --- a/cpp/src/plasma/plasma.h +++ b/cpp/src/plasma/plasma.h @@ -38,7 +38,6 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "plasma/common.h" -#include "plasma/common_generated.h" #ifdef PLASMA_CUDA using arrow::cuda::CudaIpcMemHandle; @@ -46,6 +45,10 @@ using arrow::cuda::CudaIpcMemHandle; namespace plasma { +namespace flatbuf { +struct ObjectInfoT; +} // namespace flatbuf + #define HANDLE_SIGPIPE(s, fd_) \ do { \ Status _s = (s); \ @@ -68,9 +71,6 @@ constexpr int64_t kBlockSize = 64; struct Client; -/// Mapping from object IDs to type and status of the request. -typedef std::unordered_map ObjectRequestMap; - // TODO(pcm): Replace this by the flatbuffers message PlasmaObjectSpec. struct PlasmaObject { #ifdef PLASMA_CUDA diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index c437840874538..a878647718264 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -42,10 +42,6 @@ using flatbuffers::uoffset_t; #define PLASMA_CHECK_ENUM(x, y) \ static_assert(static_cast(x) == static_cast(y), "protocol mismatch") -PLASMA_CHECK_ENUM(ObjectLocation::Local, fb::ObjectStatus::Local); -PLASMA_CHECK_ENUM(ObjectLocation::Remote, fb::ObjectStatus::Remote); -PLASMA_CHECK_ENUM(ObjectLocation::Nonexistent, fb::ObjectStatus::Nonexistent); - flatbuffers::Offset>> ToFlatbuffer(flatbuffers::FlatBufferBuilder* fbb, const ObjectID* object_ids, int64_t num_objects) { @@ -367,56 +363,6 @@ Status ReadDeleteReply(uint8_t* data, size_t size, std::vector* object return Status::OK(); } -// Satus messages. 
- -Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaStatusRequest(fbb, ToFlatbuffer(&fbb, object_ids, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaStatusRequest, &fbb, message); -} - -Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], - int64_t num_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < num_objects; ++i) { - object_ids[i] = ObjectID::from_binary(message->object_ids()->Get(i)->str()); - } - return Status::OK(); -} - -Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], - int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaStatusReply(fbb, ToFlatbuffer(&fbb, object_ids, num_objects), - fbb.CreateVector(object_status, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaStatusReply, &fbb, message); -} - -int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - return message->object_ids()->size(); -} - -Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], - int object_status[], int64_t num_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < num_objects; ++i) { - object_ids[i] = ObjectID::from_binary(message->object_ids()->Get(i)->str()); - } - for (uoffset_t i = 0; i < num_objects; ++i) { - object_status[i] = message->status()->data()[i]; - } - return Status::OK(); -} - // Contains messages. Status SendContainsRequest(int sock, ObjectID object_id) { @@ -640,95 +586,6 @@ Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], } return Status::OK(); } -// Fetch messages. - -Status SendFetchRequest(int sock, const ObjectID* object_ids, int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaFetchRequest(fbb, ToFlatbuffer(&fbb, object_ids, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaFetchRequest, &fbb, message); -} - -Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& object_ids) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < message->object_ids()->size(); ++i) { - object_ids.push_back(ObjectID::from_binary(message->object_ids()->Get(i)->str())); - } - return Status::OK(); -} - -// Wait messages. 
- -Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms) { - flatbuffers::FlatBufferBuilder fbb; - - std::vector> object_request_specs; - for (int i = 0; i < num_requests; i++) { - object_request_specs.push_back(fb::CreateObjectRequestSpec( - fbb, fbb.CreateString(object_requests[i].object_id.binary()), - static_cast(object_requests[i].type))); - } - - auto message = fb::CreatePlasmaWaitRequest(fbb, fbb.CreateVector(object_request_specs), - num_ready_objects, timeout_ms); - return PlasmaSend(sock, MessageType::PlasmaWaitRequest, &fbb, message); -} - -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - *num_ready_objects = message->num_ready_objects(); - *timeout_ms = message->timeout(); - - for (uoffset_t i = 0; i < message->object_requests()->size(); i++) { - ObjectID object_id = - ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); - ObjectRequest object_request( - {object_id, - static_cast(message->object_requests()->Get(i)->type()), - ObjectLocation::Nonexistent}); - object_requests[object_id] = object_request; - } - return Status::OK(); -} - -Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, - int num_ready_objects) { - flatbuffers::FlatBufferBuilder fbb; - - std::vector> object_replies; - for (const auto& entry : object_requests) { - const auto& object_request = entry.second; - object_replies.push_back( - fb::CreateObjectReply(fbb, fbb.CreateString(object_request.object_id.binary()), - static_cast(object_request.location))); - } - - auto message = fb::CreatePlasmaWaitReply( - fbb, fbb.CreateVector(object_replies.data(), num_ready_objects), num_ready_objects); - return PlasmaSend(sock, MessageType::PlasmaWaitReply, &fbb, message); -} - -Status ReadWaitReply(uint8_t* data, size_t size, ObjectRequest object_requests[], - int* num_ready_objects) { - DCHECK(data); - - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - *num_ready_objects = message->num_ready_objects(); - for (int i = 0; i < *num_ready_objects; i++) { - object_requests[i].object_id = - ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); - object_requests[i].location = - static_cast(message->object_requests()->Get(i)->status()); - } - return Status::OK(); -} // Subscribe messages. diff --git a/cpp/src/plasma/protocol.h b/cpp/src/plasma/protocol.h index c8204584b8adb..0362bd47797d4 100644 --- a/cpp/src/plasma/protocol.h +++ b/cpp/src/plasma/protocol.h @@ -128,21 +128,6 @@ Status SendDeleteReply(int sock, const std::vector& object_ids, Status ReadDeleteReply(uint8_t* data, size_t size, std::vector* object_ids, std::vector* errors); -/* Satus messages. */ - -Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objects); - -Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], - int64_t num_objects); - -Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], - int64_t num_objects); - -int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size); - -Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], - int object_status[], int64_t num_objects); - /* Plasma Constains message functions. 
*/ Status SendContainsRequest(int sock, ObjectID object_id); @@ -184,26 +169,6 @@ Status SendEvictReply(int sock, int64_t num_bytes); Status ReadEvictReply(uint8_t* data, size_t size, int64_t& num_bytes); -/* Plasma Fetch Remote message functions. */ - -Status SendFetchRequest(int sock, const ObjectID* object_ids, int64_t num_objects); - -Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& object_ids); - -/* Plasma Wait message functions. */ - -Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms); - -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects); - -Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, - int num_ready_objects); - -Status ReadWaitReply(uint8_t* data, size_t size, ObjectRequest object_requests[], - int* num_ready_objects); - /* Plasma Subscribe message functions. */ Status SendSubscribeRequest(int sock); diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index ae658d757c185..343ccf5b886f2 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -52,11 +52,14 @@ #include #include +#include "arrow/status.h" + #include "plasma/common.h" #include "plasma/common_generated.h" #include "plasma/fling.h" #include "plasma/io.h" #include "plasma/malloc.h" +#include "plasma/protocol.h" #ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" @@ -327,7 +330,12 @@ void PlasmaStore::ReturnFromGet(GetRequest* get_req) { if (s.ok()) { // Send all of the file descriptors for the present objects. for (int store_fd : store_fds) { - WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); + // Only send the file descriptor if it hasn't been sent (see analogous + // logic in GetStoreFd in client.cc). + if (get_req->client->used_fds.find(store_fd) == get_req->client->used_fds.end()) { + WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); + get_req->client->used_fds.insert(store_fd); + } } } @@ -783,8 +791,12 @@ Status PlasmaStore::ProcessMessage(Client* client) { HANDLE_SIGPIPE( SendCreateReply(client->fd, object_id, &object, error_code, mmap_size), client->fd); - if (error_code == PlasmaError::OK && device_num == 0) { + // Only send the file descriptor if it hasn't been sent (see analogous + // logic in GetStoreFd in client.cc). Similar in ReturnFromGet. + if (error_code == PlasmaError::OK && device_num == 0 && + client->used_fds.find(object.store_fd) == client->used_fds.end()) { WarnIfSigpipe(send_fd(client->fd, object.store_fd), client->fd); + client->used_fds.insert(object.store_fd); } } break; case fb::MessageType::PlasmaCreateAndSealRequest: { @@ -893,21 +905,22 @@ class PlasmaStoreRunner { PlasmaStoreRunner() {} void Start(char* socket_name, int64_t system_memory, std::string directory, - bool hugepages_enabled, bool use_one_memory_mapped_file) { + bool hugepages_enabled) { // Create the event loop. loop_.reset(new EventLoop); store_.reset( new PlasmaStore(loop_.get(), system_memory, directory, hugepages_enabled)); plasma_config = store_->GetPlasmaStoreInfo(); - // If the store is configured to use a single memory-mapped file, then we - // achieve that by mallocing and freeing a single large amount of space. - // that maximum allowed size up front. 
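// The replacement code below always performs the warm-up that used to hide
// behind use_one_memory_mapped_file: a dlmemalign() followed by dlfree() of
// one large block leaves dlmalloc holding a single region of that size, so
// subsequent allocations are served from one memory-mapped file. The
// 256 * sizeof(size_t) headroom leaves room for dlmalloc's internal
// bookkeeping (the patch's own comment cites up to 128 * sizeof(size_t) as
// the documented requirement).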
- if (use_one_memory_mapped_file) { - void* pointer = plasma::dlmemalign(kBlockSize, system_memory); - ARROW_CHECK(pointer != nullptr); - plasma::dlfree(pointer); - } + // We are using a single memory-mapped file by mallocing and freeing a single + // large amount of space up front. According to the documentation, + // dlmalloc might need up to 128*sizeof(size_t) bytes for internal + // bookkeeping. + void* pointer = plasma::dlmemalign(kBlockSize, system_memory - 256 * sizeof(size_t)); + ARROW_CHECK(pointer != nullptr); + // This will unmap the file, but the next one created will be as large + // as this one (this is an implementation detail of dlmalloc). + plasma::dlfree(pointer); int socket = BindIpcSock(socket_name, true); // TODO(pcm): Check return value. @@ -943,15 +956,14 @@ void HandleSignal(int signal) { } void StartServer(char* socket_name, int64_t system_memory, std::string plasma_directory, - bool hugepages_enabled, bool use_one_memory_mapped_file) { + bool hugepages_enabled) { // Ignore SIGPIPE signals. If we don't do this, then when we attempt to write // to a client that has already died, the store could die. signal(SIGPIPE, SIG_IGN); g_runner.reset(new PlasmaStoreRunner()); signal(SIGTERM, HandleSignal); - g_runner->Start(socket_name, system_memory, plasma_directory, hugepages_enabled, - use_one_memory_mapped_file); + g_runner->Start(socket_name, system_memory, plasma_directory, hugepages_enabled); } } // namespace plasma @@ -963,11 +975,9 @@ int main(int argc, char* argv[]) { // Directory where plasma memory mapped files are stored. std::string plasma_directory; bool hugepages_enabled = false; - // True if a single large memory-mapped file should be created at startup. - bool use_one_memory_mapped_file = false; int64_t system_memory = -1; int c; - while ((c = getopt(argc, argv, "s:m:d:hf")) != -1) { + while ((c = getopt(argc, argv, "s:m:d:h")) != -1) { switch (c) { case 'd': plasma_directory = std::string(optarg); @@ -982,14 +992,16 @@ int main(int argc, char* argv[]) { char extra; int scanned = sscanf(optarg, "%" SCNd64 "%c", &system_memory, &extra); ARROW_CHECK(scanned == 1); + // Set system memory, potentially rounding it to a page size + // Also make it so dlmalloc fails if we try to request more memory than + // is available. + system_memory = + plasma::dlmalloc_set_footprint_limit(static_cast(system_memory)); ARROW_LOG(INFO) << "Allowing the Plasma store to use up to " << static_cast(system_memory) / 1000000000 << "GB of memory."; break; } - case 'f': - use_one_memory_mapped_file = true; - break; default: exit(-1); } @@ -1039,12 +1051,8 @@ int main(int argc, char* argv[]) { SetMallocGranularity(1024 * 1024 * 1024); // 1 GB } #endif - // Make it so dlmalloc fails if we try to request more memory than is - // available. 
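// Note the assignment form used above: dlmalloc_set_footprint_limit() hands
// back the limit it actually applied, so storing the result keeps
// system_memory in sync with what dlmalloc will enforce (the comment above
// suggests the value may be rounded, e.g. to a page size). Sketch of the
// assumed contract:
//
//   size_t applied = plasma::dlmalloc_set_footprint_limit(requested);
//   // `applied` is the enforced cap; use it instead of `requested` from here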
- plasma::dlmalloc_set_footprint_limit((size_t)system_memory); ARROW_LOG(DEBUG) << "starting server listening on " << socket_name; - plasma::StartServer(socket_name, system_memory, plasma_directory, hugepages_enabled, - use_one_memory_mapped_file); + plasma::StartServer(socket_name, system_memory, plasma_directory, hugepages_enabled); plasma::g_runner->Shutdown(); plasma::g_runner = nullptr; diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 8d3facd733f1c..9866e74576f00 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -29,10 +29,18 @@ #include "plasma/events.h" #include "plasma/eviction_policy.h" #include "plasma/plasma.h" -#include "plasma/protocol.h" + +namespace arrow { +class Status; +} // namespace arrow namespace plasma { +namespace flatbuf { +struct ObjectInfoT; +enum class PlasmaError; +} // namespace flatbuf + using flatbuf::ObjectInfoT; using flatbuf::PlasmaError; @@ -54,6 +62,9 @@ struct Client { /// Object ids that are used by this client. std::unordered_set object_ids; + /// File descriptors that are used by this client. + std::unordered_set used_fds; + /// The file descriptor used to push notifications to client. This is only valid /// if client subscribes to plasma store. -1 indicates invalid. int notification_fd; @@ -173,7 +184,7 @@ class PlasmaStore { NotificationMap::iterator SendNotifications(NotificationMap::iterator it); - Status ProcessMessage(Client* client); + arrow::Status ProcessMessage(Client* client); private: void PushNotification(ObjectInfoT* object_notification); diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index f820303aba42b..1678e27f90f58 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -60,7 +60,7 @@ class TestPlasmaStore : public ::testing::Test { std::string plasma_directory = test_executable.substr(0, test_executable.find_last_of("/")); std::string plasma_command = plasma_directory + - "/plasma_store_server -m 1000000000 -s " + + "/plasma_store_server -m 10000000 -s " + store_socket_name_ + " 1> /dev/null 2> /dev/null &"; system(plasma_command.c_str()); ARROW_CHECK_OK(client_.Connect(store_socket_name_, "")); @@ -82,7 +82,7 @@ class TestPlasmaStore : public ::testing::Test { void CreateObject(PlasmaClient& client, const ObjectID& object_id, const std::vector& metadata, - const std::vector& data) { + const std::vector& data, bool release = true) { std::shared_ptr data_buffer; ARROW_CHECK_OK(client.Create(object_id, data.size(), &metadata[0], metadata.size(), &data_buffer)); @@ -90,7 +90,9 @@ class TestPlasmaStore : public ::testing::Test { data_buffer->mutable_data()[i] = data[i]; } ARROW_CHECK_OK(client.Seal(object_id)); - ARROW_CHECK_OK(client.Release(object_id)); + if (release) { + ARROW_CHECK_OK(client.Release(object_id)); + } } const std::string& GetStoreSocketName() const { return store_socket_name_; } @@ -155,11 +157,12 @@ TEST_F(TestPlasmaStore, SealErrorsTest) { // Create object. std::vector data(100, 0); - CreateObject(client_, object_id, {42}, data); + CreateObject(client_, object_id, {42}, data, false); // Trying to seal it again. result = client_.Seal(object_id); ASSERT_TRUE(result.IsPlasmaObjectAlreadySealed()); + ARROW_CHECK_OK(client_.Release(object_id)); } TEST_F(TestPlasmaStore, DeleteTest) { @@ -184,7 +187,6 @@ TEST_F(TestPlasmaStore, DeleteTest) { ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); ASSERT_TRUE(has_object); - // Avoid race condition of Plasma Manager waiting for notification. 
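// CreateObject() in this test fixture now takes a `release` flag (default
// true), so tests that must keep the creation reference alive can opt out and
// release explicitly, as SealErrorsTest does above:
//
//   CreateObject(client_, object_id, {42}, data, /*release=*/false);
//   // ... assert on the still-in-use object ...
//   ARROW_CHECK_OK(client_.Release(object_id));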
ARROW_CHECK_OK(client_.Release(object_id)); // object_id is marked as to-be-deleted, when it is not in use, it will be deleted. ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); @@ -228,13 +230,7 @@ TEST_F(TestPlasmaStore, DeleteObjectsTest) { // client2_ won't send the release request immediately because the trigger // condition is not reached. The release is only added to release cache. object_buffers.clear(); - // The reference count went to zero, but the objects are still in the release - // cache. - ARROW_CHECK_OK(client_.Contains(object_id1, &has_object)); - ASSERT_TRUE(has_object); - ARROW_CHECK_OK(client_.Contains(object_id2, &has_object)); - ASSERT_TRUE(has_object); - // The Delete call will flush release cache and send the Delete request. + // Delete the objects. result = client2_.Delete(std::vector{object_id1, object_id2}); ARROW_CHECK_OK(client_.Contains(object_id1, &has_object)); ASSERT_FALSE(has_object); @@ -254,7 +250,6 @@ TEST_F(TestPlasmaStore, ContainsTest) { // First create object. std::vector data(100, 0); CreateObject(client_, object_id, {42}, data); - // Avoid race condition of Plasma Manager waiting for notification. std::vector object_buffers; ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers)); ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); @@ -277,7 +272,6 @@ TEST_F(TestPlasmaStore, GetTest) { // First create object. std::vector data = {3, 5, 6, 7, 9}; CreateObject(client_, object_id, {42}, data); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); object_buffers.clear(); @@ -291,11 +285,9 @@ TEST_F(TestPlasmaStore, GetTest) { auto metadata = object_buffers[0].metadata; object_buffers.clear(); ::arrow::AssertBufferEqual(*metadata, std::string{42}); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); } // Object is automatically released - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); } @@ -314,17 +306,14 @@ TEST_F(TestPlasmaStore, LegacyGetTest) { // First create object. std::vector data = {3, 5, 6, 7, 9}; CreateObject(client_, object_id, {42}, data); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); AssertObjectBufferEqual(object_buffer, {42}, {3, 5, 6, 7, 9}); } // Object needs releasing manually - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Release(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); } @@ -377,11 +366,9 @@ TEST_F(TestPlasmaStore, AbortTest) { ASSERT_TRUE(status.IsInvalid()); // Release, then abort. ARROW_CHECK_OK(client_.Release(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Abort(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); // Test for object non-existence after the abort. @@ -394,7 +381,6 @@ TEST_F(TestPlasmaStore, AbortTest) { // Test that we can get the object. 
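// With the client-side release cache gone, dropping the last ObjectBuffer
// from a Get() pushes the release to the store immediately, which is why the
// FlushReleaseHistory() calls disappear from these tests: IsInUse() can be
// asserted right after the buffers go out of scope. Pattern sketch:
//
//   {
//     std::vector<ObjectBuffer> object_buffers;
//     ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers));
//     // ... inspect object_buffers[0].data ...
//   }
//   EXPECT_FALSE(client_.IsInUse(object_id));  // no flush step required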
ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers)); AssertObjectBufferEqual(object_buffers[0], {42, 43}, {1, 2, 3, 4, 5}); - ARROW_CHECK_OK(client_.Release(object_id)); } TEST_F(TestPlasmaStore, MultipleClientTest) { diff --git a/cpp/src/plasma/test/serialization_tests.cc b/cpp/src/plasma/test/serialization_tests.cc index 085ae97db980f..66d651d2923bf 100644 --- a/cpp/src/plasma/test/serialization_tests.cc +++ b/cpp/src/plasma/test/serialization_tests.cc @@ -254,44 +254,6 @@ TEST(PlasmaSerialization, DeleteReply) { close(fd); } -TEST(PlasmaSerialization, StatusRequest) { - int fd = create_temp_file(); - constexpr int64_t num_objects = 2; - ObjectID object_ids[num_objects]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - ARROW_CHECK_OK(SendStatusRequest(fd, object_ids, num_objects)); - std::vector data = - read_message_from_file(fd, MessageType::PlasmaStatusRequest); - ObjectID object_ids_read[num_objects]; - ARROW_CHECK_OK( - ReadStatusRequest(data.data(), data.size(), object_ids_read, num_objects)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - close(fd); -} - -TEST(PlasmaSerialization, StatusReply) { - int fd = create_temp_file(); - ObjectID object_ids[2]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - int object_statuses[2] = {42, 43}; - ARROW_CHECK_OK(SendStatusReply(fd, object_ids, object_statuses, 2)); - std::vector data = read_message_from_file(fd, MessageType::PlasmaStatusReply); - int64_t num_objects = ReadStatusReply_num_objects(data.data(), data.size()); - - std::vector object_ids_read(num_objects); - std::vector object_statuses_read(num_objects); - ARROW_CHECK_OK(ReadStatusReply(data.data(), data.size(), object_ids_read.data(), - object_statuses_read.data(), num_objects)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - ASSERT_EQ(object_statuses[0], object_statuses_read[0]); - ASSERT_EQ(object_statuses[1], object_statuses_read[1]); - close(fd); -} - TEST(PlasmaSerialization, EvictRequest) { int fd = create_temp_file(); int64_t num_bytes = 111; @@ -314,84 +276,6 @@ TEST(PlasmaSerialization, EvictReply) { close(fd); } -TEST(PlasmaSerialization, FetchRequest) { - int fd = create_temp_file(); - ObjectID object_ids[2]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - ARROW_CHECK_OK(SendFetchRequest(fd, object_ids, 2)); - std::vector data = read_message_from_file(fd, MessageType::PlasmaFetchRequest); - std::vector object_ids_read; - ARROW_CHECK_OK(ReadFetchRequest(data.data(), data.size(), object_ids_read)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - close(fd); -} - -TEST(PlasmaSerialization, WaitRequest) { - int fd = create_temp_file(); - const int num_objects_in = 2; - ObjectRequest object_requests_in[num_objects_in] = { - ObjectRequest({random_object_id(), ObjectRequestType::PLASMA_QUERY_ANYWHERE, - ObjectLocation::Local}), - ObjectRequest({random_object_id(), ObjectRequestType::PLASMA_QUERY_LOCAL, - ObjectLocation::Local})}; - const int num_ready_objects_in = 1; - int64_t timeout_ms = 1000; - - ARROW_CHECK_OK(SendWaitRequest(fd, &object_requests_in[0], num_objects_in, - num_ready_objects_in, timeout_ms)); - /* Read message back. 
*/ - std::vector data = read_message_from_file(fd, MessageType::PlasmaWaitRequest); - int num_ready_objects_out; - int64_t timeout_ms_read; - ObjectRequestMap object_requests_out; - ARROW_CHECK_OK(ReadWaitRequest(data.data(), data.size(), object_requests_out, - &timeout_ms_read, &num_ready_objects_out)); - ASSERT_EQ(num_objects_in, object_requests_out.size()); - ASSERT_EQ(num_ready_objects_out, num_ready_objects_in); - for (int i = 0; i < num_objects_in; i++) { - const ObjectID& object_id = object_requests_in[i].object_id; - ASSERT_EQ(1, object_requests_out.count(object_id)); - const auto& entry = object_requests_out.find(object_id); - ASSERT_TRUE(entry != object_requests_out.end()); - ASSERT_EQ(entry->second.object_id, object_requests_in[i].object_id); - ASSERT_EQ(entry->second.type, object_requests_in[i].type); - } - close(fd); -} - -TEST(PlasmaSerialization, WaitReply) { - int fd = create_temp_file(); - const int num_objects_in = 2; - /* Create a map with two ObjectRequests in it. */ - ObjectRequestMap objects_in(num_objects_in); - ObjectID id1 = random_object_id(); - objects_in[id1] = - ObjectRequest({id1, ObjectRequestType::PLASMA_QUERY_LOCAL, ObjectLocation::Local}); - ObjectID id2 = random_object_id(); - objects_in[id2] = ObjectRequest( - {id2, ObjectRequestType::PLASMA_QUERY_LOCAL, ObjectLocation::Nonexistent}); - - ARROW_CHECK_OK(SendWaitReply(fd, objects_in, num_objects_in)); - /* Read message back. */ - std::vector data = read_message_from_file(fd, MessageType::PlasmaWaitReply); - ObjectRequest objects_out[2]; - int num_objects_out; - ARROW_CHECK_OK( - ReadWaitReply(data.data(), data.size(), &objects_out[0], &num_objects_out)); - ASSERT_EQ(num_objects_in, num_objects_out); - for (int i = 0; i < num_objects_out; i++) { - /* Each object request must appear exactly once. */ - ASSERT_EQ(objects_in.count(objects_out[i].object_id), 1); - const auto& entry = objects_in.find(objects_out[i].object_id); - ASSERT_TRUE(entry != objects_in.end()); - ASSERT_EQ(entry->second.object_id, objects_out[i].object_id); - ASSERT_EQ(entry->second.location, objects_out[i].location); - } - close(fd); -} - TEST(PlasmaSerialization, DataRequest) { int fd = create_temp_file(); ObjectID object_id1 = random_object_id(); diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 46ae2605c2de3..bb7b6abbb3fbe 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 46ae2605c2de306f5740587107dcf333a527f2d1 +Subproject commit bb7b6abbb3fbeff845646364a4286142127be04c diff --git a/cpp/thirdparty/README.md b/cpp/thirdparty/README.md index bd1cb28d81818..9be3361e5d54f 100644 --- a/cpp/thirdparty/README.md +++ b/cpp/thirdparty/README.md @@ -29,17 +29,24 @@ offline builds. 
To set up your own specific build toolchain, here are the relevant environment variables +* brotli: `BROTLI_HOME`, can be disabled with `-DARROW_WITH_BROTLI=off` * Boost: `BOOST_ROOT` +* double-conversion: `DOUBLE_CONVERSION_HOME` * Googletest: `GTEST_HOME` (only required to build the unit tests) * gflags: `GFLAGS_HOME` (only required to build the unit tests) +* glog: `GLOG_HOME` (only required if `ARROW_USE_GLOG=ON`) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) * Flatbuffers: `FLATBUFFERS_HOME` (only required for -DARROW_IPC=on, which is the default) * Hadoop: `HADOOP_HOME` (only required for the HDFS I/O extensions) * jemalloc: `JEMALLOC_HOME` -* brotli: `BROTLI_HOME`, can be disabled with `-DARROW_WITH_BROTLI=off` * lz4: `LZ4_HOME`, can be disabled with `-DARROW_WITH_LZ4=off` +* Apache ORC: `ORC_HOME` +* protobuf: `PROTOBUF_HOME` +* rapidjson: `RAPIDJSON_HOME` +* re2: `RE2_HOME` (only required to build Gandiva currently) * snappy: `SNAPPY_HOME`, can be disabled with `-DARROW_WITH_SNAPPY=off` +* thrift: `THRIFT_HOME` * zlib: `ZLIB_HOME`, can be disabled with `-DARROW_WITH_ZLIB=off` * zstd: `ZSTD_HOME`, can be disabled with `-DARROW_WITH_ZSTD=off` @@ -69,24 +76,26 @@ script: ```shell # Download tarballs into `$HOME/arrow-thirdparty-deps` -$ ./thirdparty/download_dependencies $HOME/arrow-thirdparty-deps -# some output omitted - +$ ./thirdparty/download_dependencies $HOME/arrow-thirdparty # Environment variables for offline Arrow build -export ARROW_BOOST_URL=$HOME/arrow-thirdparty-deps/boost.tar.gz -export ARROW_GTEST_URL=$HOME/arrow-thirdparty-deps/gtest.tar.gz -export ARROW_GFLAGS_URL=$HOME/arrow-thirdparty-deps/gflags.tar.gz -export ARROW_GBENCHMARK_URL=$HOME/arrow-thirdparty-deps/gbenchmark.tar.gz -export ARROW_FLATBUFFERS_URL=$HOME/arrow-thirdparty-deps/flatbuffers.tar.gz -export ARROW_RAPIDJSON_URL=$HOME/arrow-thirdparty-deps/rapidjson.tar.gz -export ARROW_SNAPPY_URL=$HOME/arrow-thirdparty-deps/snappy.tar.gz -export ARROW_BROTLI_URL=$HOME/arrow-thirdparty-deps/brotli.tar.gz -export ARROW_LZ4_URL=$HOME/arrow-thirdparty-deps/lz4.tar.gz -export ARROW_ZLIB_URL=$HOME/arrow-thirdparty-deps/zlib.tar.gz -export ARROW_ZSTD_URL=$HOME/arrow-thirdparty-deps/zstd.tar.gz -export ARROW_PROTOBUF_URL=$HOME/arrow-thirdparty-deps/protobuf.tar.gz -export ARROW_GRPC_URL=$HOME/arrow-thirdparty-deps/grpc.tar.gz -export ARROW_ORC_URL=$HOME/arrow-thirdparty-deps/orc.tar.gz +export ARROW_BOOST_URL=$HOME/arrow-thirdparty/boost-1.67.0.tar.gz +export ARROW_BROTLI_URL=$HOME/arrow-thirdparty/brotli-v0.6.0.tar.gz +export ARROW_DOUBLE_CONVERSION_URL=$HOME/arrow-thirdparty/double-conversion-v3.1.1.tar.gz +export ARROW_FLATBUFFERS_URL=$HOME/arrow-thirdparty/flatbuffers-02a7807dd8d26f5668ffbbec0360dc107bbfabd5.tar.gz +export ARROW_GBENCHMARK_URL=$HOME/arrow-thirdparty/gbenchmark-v1.4.1.tar.gz +export ARROW_GFLAGS_URL=$HOME/arrow-thirdparty/gflags-v2.2.0.tar.gz +export ARROW_GLOG_URL=$HOME/arrow-thirdparty/glog-v0.3.5.tar.gz +export ARROW_GRPC_URL=$HOME/arrow-thirdparty/grpc-v1.14.1.tar.gz +export ARROW_GTEST_URL=$HOME/arrow-thirdparty/gtest-1.8.0.tar.gz +export ARROW_LZ4_URL=$HOME/arrow-thirdparty/lz4-v1.7.5.tar.gz +export ARROW_ORC_URL=$HOME/arrow-thirdparty/orc-1.5.4.tar.gz +export ARROW_PROTOBUF_URL=$HOME/arrow-thirdparty/protobuf-v3.6.1.tar.gz +export ARROW_RAPIDJSON_URL=$HOME/arrow-thirdparty/rapidjson-v1.1.0.tar.gz +export ARROW_RE2_URL=$HOME/arrow-thirdparty/re2-2018-10-01.tar.gz +export ARROW_SNAPPY_URL=$HOME/arrow-thirdparty/snappy-1.1.3.tar.gz +export 
ARROW_THRIFT_URL=$HOME/arrow-thirdparty/thrift-0.11.0.tar.gz +export ARROW_ZLIB_URL=$HOME/arrow-thirdparty/zlib-1.2.8.tar.gz +export ARROW_ZSTD_URL=$HOME/arrow-thirdparty/zstd-v1.3.7.tar.gz ``` This can be automated by using inline source/eval: diff --git a/cpp/thirdparty/download_dependencies.sh b/cpp/thirdparty/download_dependencies.sh index ea63a8a41fb4e..f782963dd1450 100755 --- a/cpp/thirdparty/download_dependencies.sh +++ b/cpp/thirdparty/download_dependencies.sh @@ -30,7 +30,7 @@ else DESTDIR=$1 fi -DESTDIR=$(realpath "${DESTDIR}") +DESTDIR=$(readlink -f "${DESTDIR}") download_dependency() { local url=$1 @@ -38,7 +38,7 @@ download_dependency() { # --show-progress will not output to stdout, it is safe to pipe the result of # the script into eval. - wget --quiet --show-progress --continue --output-document="${out}" "${url}" + wget --quiet --continue --output-document="${out}" "${url}" } main() { diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 705f56c0e6130..a2393b6fb3eb0 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -34,7 +34,7 @@ GRPC_VERSION=v1.14.1 GTEST_VERSION=1.8.0 JEMALLOC_VERSION=17c897976c60b0e6e4f4a365c751027244dada7a LZ4_VERSION=v1.7.5 -ORC_VERSION=1.5.1 +ORC_VERSION=1.5.4 PROTOBUF_VERSION=v3.6.1 RAPIDJSON_VERSION=v1.1.0 RE2_VERSION=2018-10-01 @@ -61,6 +61,7 @@ DEPENDENCIES=( "ARROW_ORC_URL orc-${ORC_VERSION}.tar.gz https://github.com/apache/orc/archive/rel/release-${ORC_VERSION}.tar.gz" "ARROW_PROTOBUF_URL protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/google/protobuf/releases/download/${PROTOBUF_VERSION}/protobuf-all-${PROTOBUF_VERSION:1}.tar.gz" "ARROW_RAPIDJSON_URL rapidjson-${RAPIDJSON_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${RAPIDJSON_VERSION}.tar.gz" + "ARROW_RE2_URL re2-${RE2_VERSION}.tar.gz https://github.com/google/re2/archive/${RE2_VERSION}.tar.gz" "ARROW_SNAPPY_URL snappy-${SNAPPY_VERSION}.tar.gz https://github.com/google/snappy/releases/download/${SNAPPY_VERSION}/snappy-${SNAPPY_VERSION}.tar.gz" "ARROW_THRIFT_URL thrift-${THRIFT_VERSION}.tar.gz http://archive.apache.org/dist/thrift/${THRIFT_VERSION}/thrift-${THRIFT_VERSION}.tar.gz" "ARROW_ZLIB_URL zlib-${ZLIB_VERSION}.tar.gz http://zlib.net/fossils/zlib-${ZLIB_VERSION}.tar.gz" diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 47aea28ff6828..bbbec29c13009 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -26,7 +26,9 @@ if (PARQUET_BUILD_EXECUTABLES) target_link_libraries(${TOOL} parquet_static) # Avoid unsetting RPATH when installing set_target_properties(${TOOL} PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) - install(TARGETS ${TOOL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install(TARGETS ${TOOL} + ${INSTALL_IS_OPTIONAL} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) add_dependencies(parquet ${PARQUET_TOOLS}) diff --git a/cpp/tools/parquet/parquet-reader.cc b/cpp/tools/parquet/parquet-reader.cc index 34bdfc103dcc0..a5b7db1330a97 100644 --- a/cpp/tools/parquet/parquet-reader.cc +++ b/cpp/tools/parquet/parquet-reader.cc @@ -23,7 +23,7 @@ int main(int argc, char** argv) { if (argc > 5 || argc < 2) { - std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]" + std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json]" "[--print-key-value-metadata] [--columns=...] 
" << std::endl; return -1; diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp index 8e707e39e7cd8..8d2d5da904bab 100644 --- a/cpp/valgrind.supp +++ b/cpp/valgrind.supp @@ -21,4 +21,33 @@ Memcheck:Cond fun:*CastFunctor*BooleanType* } - +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... + fun:*llvm*PassManager* +} +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... + fun:*re2*RE2* +} +{ + :Use of uninitialised value of size 8 + Memcheck:Value8 + ... + fun:*re2*RE2* +} +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... + fun:*re2*Prog* +} +{ + :Use of uninitialised value of size 8 + Memcheck:Value8 + ... + fun:*re2*Prog* +} diff --git a/csharp/build/Common.props b/csharp/build/Common.props index 9e7901d8a109f..cebd07cf46157 100644 --- a/csharp/build/Common.props +++ b/csharp/build/Common.props @@ -2,7 +2,4 @@ ../../artifacts/$(AssemblyName) - - - \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs b/csharp/src/Apache.Arrow/Arrays/Array.cs index a9609f20f1210..e795ad9843717 100644 --- a/csharp/src/Apache.Arrow/Arrays/Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Array.cs @@ -33,8 +33,6 @@ protected Array(ArrayData data) public int NullCount => Data.NullCount; - public Bitmap NullBitmap => Data.NullBitmap; - public ArrowBuffer NullBitmapBuffer => Data.Buffers[0]; public virtual void Accept(IArrowArrayVisitor visitor) @@ -43,7 +41,7 @@ public virtual void Accept(IArrowArrayVisitor visitor) } public bool IsValid(int index) => - NullBitmapBuffer == null || NullBitmap.IsSet(index); + NullBitmapBuffer.IsEmpty || BitUtility.GetBit(NullBitmapBuffer.Span, index); public bool IsNull(int index) => !IsValid(index); @@ -51,13 +49,14 @@ public bool IsValid(int index) => internal static void Accept(T array, IArrowArrayVisitor visitor) where T : class, IArrowArray { - if (visitor is IArrowArrayVisitor v) - { - v.Visit(array); - } - else + switch (visitor) { - visitor.Visit(array); + case IArrowArrayVisitor typedVisitor: + typedVisitor.Visit(array); + break; + default: + visitor.Visit(array); + break; } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs index a8d745c66c150..2074f125dc616 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs @@ -19,7 +19,7 @@ namespace Apache.Arrow { - public class ArrayData + public sealed class ArrayData { public readonly IArrowType DataType; public readonly int Length; @@ -28,9 +28,6 @@ public class ArrayData public readonly ArrowBuffer[] Buffers; public readonly ArrayData[] Children; - public ArrowBuffer NullBitmapBuffer => Buffers[0]; - public Bitmap NullBitmap => NullBitmapBuffer; - public ArrayData( IArrowType dataType, int length, int nullCount = 0, int offset = 0, diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index cbe64dfc1bba2..12ef5ee7a7291 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -50,28 +50,31 @@ public BinaryArray(IArrowType dataType, int length, public ArrowBuffer ValueBuffer => Data.Buffers[2]; + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(0, Length + 1); + + public ReadOnlySpan Values => ValueBuffer.Span.CastTo(); + [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetValueOffset(int index) { - var offsets = ValueOffsetsBuffer.GetSpan(); - return 
offsets[Offset + index]; + return ValueOffsets[Offset + index]; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public int GetValueLength(int index) { - var offsets = ValueOffsetsBuffer.GetSpan(); + var offsets = ValueOffsets; var offset = Offset + index; + return offsets[offset + 1] - offsets[offset]; } - public ReadOnlySpan GetValue(int index) + public ReadOnlySpan GetBytes(int index) { var offset = GetValueOffset(index); var length = GetValueLength(index); - var values = ValueBuffer.GetSpan(); - - return values.Slice(offset, length); + + return ValueBuffer.Span.Slice(offset, length); } } diff --git a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs index f4197122e37c0..ddee188d98dc7 100644 --- a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs @@ -39,8 +39,7 @@ public BooleanArray(ArrayData data) if (IsNull(index)) return null; - var span = GetSpan(); - return BitUtility.GetBit(span, index); + return BitUtility.GetBit(Values, index); } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ListArray.cs b/csharp/src/Apache.Arrow/Arrays/ListArray.cs index e3872bceb0abb..3540f5afbc579 100644 --- a/csharp/src/Apache.Arrow/Arrays/ListArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/ListArray.cs @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +using System; using Apache.Arrow.Types; namespace Apache.Arrow @@ -23,6 +24,8 @@ public class ListArray : Array public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1]; + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(0, Length + 1); + public ListArray(IArrowType dataType, int length, ArrowBuffer valueOffsetsBuffer, IArrowArray values, ArrowBuffer nullBitmapBuffer, int nullCount = 0, int offset = 0) @@ -43,14 +46,13 @@ public ListArray(ArrayData data) public int GetValueOffset(int index) { - var span = ValueOffsetsBuffer.GetSpan(Offset); - return span[index]; + return ValueOffsets[index]; } public int GetValueLength(int index) { - var span = ValueOffsetsBuffer.GetSpan(Offset); - return span[index + 1] - span[index]; + var offsets = ValueOffsets; + return offsets[index + 1] - offsets[index]; } } } diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs index 6dcb10333d2fd..617bddc50dd43 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs @@ -16,8 +16,6 @@ using System; using System.Collections.Generic; using System.Runtime.CompilerServices; -using Apache.Arrow.Memory; -using Apache.Arrow.Types; namespace Apache.Arrow { @@ -33,18 +31,17 @@ protected PrimitiveArray(ArrayData data) public ArrowBuffer ValueBuffer => Data.Buffers[1]; - public Span GetSpan() => ValueBuffer.GetSpan().Slice(0, Length); + public ReadOnlySpan Values => ValueBuffer.Span.CastTo().Slice(0, Length); [MethodImpl(MethodImplOptions.AggressiveInlining)] public T? GetValue(int index) { - var span = GetSpan(); - return IsValid(index) ? span[index] : (T?) null; + return IsValid(index) ? Values[index] : (T?) 
null; } public IList ToList(bool includeNulls = false) { - var span = GetSpan(); + var span = Values; var list = new List(span.Length); for (var i = 0; i < span.Length; i++) diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index 3c8f8c0599527..9ea9522b2bdb9 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -39,12 +39,12 @@ public string GetString(int index, Encoding encoding = default) { encoding = encoding ?? Encoding.UTF8; - var value = GetValue(index); + var bytes = GetBytes(index); unsafe { - fixed (byte* data = &MemoryMarshal.GetReference(value)) - return encoding.GetString(data, value.Length); + fixed (byte* data = &MemoryMarshal.GetReference(bytes)) + return encoding.GetString(data, bytes.Length); } } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs index 174f6500a1058..f9fd0aec275d2 100644 --- a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs @@ -38,14 +38,12 @@ public TimestampArray(ArrayData data) public DateTimeOffset? GetTimestamp(int index) { - var span = GetSpan(); - if (IsNull(index)) { return null; } - var value = span[index]; + var value = Values[index]; var type = Data.DataType as TimestampType; switch (type.Unit) @@ -60,7 +58,7 @@ public TimestampArray(ArrayData data) return DateTimeOffset.FromUnixTimeSeconds(value); default: throw new InvalidDataException( - string.Format("Unsupported timestamp unit <{0}>", type.Unit)); + $"Unsupported timestamp unit <{type.Unit}>"); } } } diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index 7ba7f9f5b8e27..8bccea2b59e31 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -28,9 +28,9 @@ public class UnionArray: Array public ArrowBuffer ValueOffsetBuffer => Data.Buffers[2]; - public ReadOnlySpan TypeIds => TypeBuffer.GetSpan(); + public ReadOnlySpan TypeIds => TypeBuffer.Span; - public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.GetSpan(); + public ReadOnlySpan ValueOffsets => ValueOffsetBuffer.Span.CastTo().Slice(0, Length + 1); public UnionArray(ArrayData data) : base(data) diff --git a/csharp/src/Apache.Arrow/ArrowBuffer.Builder.cs b/csharp/src/Apache.Arrow/ArrowBuffer.Builder.cs index a85fa2dc9d949..7ab26fac95aa3 100644 --- a/csharp/src/Apache.Arrow/ArrowBuffer.Builder.cs +++ b/csharp/src/Apache.Arrow/ArrowBuffer.Builder.cs @@ -17,70 +17,147 @@ using System; using System.Collections.Generic; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; namespace Apache.Arrow { - public partial class ArrowBuffer + public partial struct ArrowBuffer { - /// - /// Builds an Arrow buffer from primitive values. 
-        /// </summary>
-        /// <typeparam name="T">Primitive type</typeparam>
        public class Builder<T>
            where T : struct
        {
            private readonly int _size;
-            private readonly MemoryPool _pool;
-            private Memory<byte> _memory;
-            private int _offset;
+            private byte[] _buffer;

-            public Builder(int initialCapacity = 8, MemoryPool pool = default)
-            {
-                if (initialCapacity <= 0) initialCapacity = 1;
-                if (pool == null) pool = DefaultMemoryPool.Instance.Value;
+            public int Capacity => _buffer.Length / _size;
+            public int Length { get; private set; }

+            public Builder(int capacity = 8)
+            {
                _size = Unsafe.SizeOf<T>();
-                _pool = pool;
-                _memory = _pool.Allocate(initialCapacity * _size);
+                _buffer = new byte[capacity * _size];
+
+                Length = 0;
+            }
+
+            public Builder<T> Append(ArrowBuffer buffer)
+            {
+                Append(buffer.Span.CastTo<T>());
+                return this;
            }

            public Builder<T> Append(T value)
            {
-                var span = GetSpan();
+                var span = EnsureCapacity(1);
+                span[Length++] = value;
+                return this;
+            }
+
+            public Builder<T> Append(ReadOnlySpan<T> source)
+            {
+                var span = EnsureCapacity(source.Length);
+                source.CopyTo(span.Slice(Length, source.Length));
+                Length += source.Length;
+                return this;
+            }

-                if (_offset + 1 >= span.Length)
+            public Builder<T> Append(Func<IEnumerable<T>> fn)
+            {
+                if (fn != null)
                {
-                    // TODO: Consider a specifiable growth strategy
+                    AppendRange(fn());

-                    _memory = _pool.Reallocate(_memory, (_memory.Length * 3) / 2);
+                }
+
+                return this;
+            }
+
+            public Builder<T> AppendRange(IEnumerable<T> values)
+            {
+                if (values != null)
+                {
+                    foreach (var v in values)
+                    {
+                        Append(v);
+                    }
                }

-                span[_offset++] = value;
                return this;
            }

-            public Builder<T> Set(int index, T value)
+            public Builder<T> Reserve(int capacity)
            {
-                var span = GetSpan();
-                span[index] = value;
+                EnsureCapacity(capacity);
+                return this;
+            }
+
+            public Builder<T> Resize(int capacity)
+            {
+                if (capacity < 0)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(capacity));
+                }
+
+                Reallocate(capacity);
+                Length = Math.Min(Length, capacity);
                return this;
            }

            public Builder<T> Clear()
            {
-                var span = GetSpan();
-                span.Fill(default);
+                Span.Fill(default);
+                Length = 0;
                return this;
            }

-            public ArrowBuffer Build()
+            public ArrowBuffer Build(MemoryPool pool = default)
+            {
+                var length = BitUtility.RoundUpToMultipleOf64(_buffer.Length);
+                var memoryPool = pool ?? MemoryPool.Default.Value;
+                var memory = memoryPool.Allocate(length);
+
+                Memory.CopyTo(memory);
+
+                return new ArrowBuffer(memory);
+            }
+
+            private Span<T> EnsureCapacity(int len)
            {
-                return new ArrowBuffer(_memory, _offset);
+                var targetCapacity = Length + len;
+
+                if (targetCapacity > Capacity)
+                {
+                    // TODO: specifiable growth strategy
+
+                    var capacity = Math.Max(
+                        targetCapacity * _size, _buffer.Length * 2);
+
+                    Reallocate(capacity);
+                }
+
+                return Span;
            }

-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            private Span<T> GetSpan() => MemoryMarshal.Cast<byte, T>(_memory.Span);
+            private void Reallocate(int length)
+            {
+                if (length < 0)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(length));
+                }
+
+                if (length != 0)
+                {
+                    System.Array.Resize(ref _buffer, length);
+                }
+            }
+
+            private Memory<byte> Memory => _buffer;
+
+            private Span<T> Span
+            {
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                get => Memory.Span.CastTo<T>();
+            }
        }
+
    }
}
diff --git a/csharp/src/Apache.Arrow/ArrowBuffer.cs b/csharp/src/Apache.Arrow/ArrowBuffer.cs
index ec2c3cbe8c123..8901ff93da55c 100644
--- a/csharp/src/Apache.Arrow/ArrowBuffer.cs
+++ b/csharp/src/Apache.Arrow/ArrowBuffer.cs
@@ -13,116 +13,43 @@
// See the License for the specific language governing permissions and
// limitations under the License.
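To make the builder rewrite above concrete, here is a minimal usage sketch. The generic arguments follow the reconstruction of the stripped type parameters (`Builder<T>`, `CastTo<T>`), so treat the exact signatures as an assumption based on this diff rather than a verified API:

```csharp
using System;
using Apache.Arrow;

class BuilderSketch
{
    static void Main()
    {
        // Append single values and ranges; the builder grows its backing
        // byte[] as needed (doubling, per EnsureCapacity above).
        var buffer = new ArrowBuffer.Builder<int>(capacity: 8)
            .Append(1)
            .AppendRange(new[] { 2, 3, 4 })
            .Build();  // one copy into pooled memory, rounded up to 64 bytes

        // The result is an immutable ArrowBuffer over the pooled memory;
        // with an 8-element int builder this prints 64, not 16.
        Console.WriteLine(buffer.Length);
    }
}
```

Backing the builder with a plain `byte[]` and copying once in `Build` keeps appends allocation-pool-free until the final pooled copy, which is the design change this hunk is making.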
-using Apache.Arrow.Memory; using System; -using System.Buffers; -using System.IO; -using System.Runtime.InteropServices; -using System.Threading; -using System.Threading.Tasks; +using System.Runtime.CompilerServices; +using Apache.Arrow.Memory; namespace Apache.Arrow { - public partial class ArrowBuffer: IEquatable + public readonly partial struct ArrowBuffer: IEquatable { - public ArrowBuffer(Memory data, int size) + public static ArrowBuffer Empty => new ArrowBuffer(Memory.Empty); + + private ArrowBuffer(Memory data) { Memory = data; - Size = size; } - /// - /// Allocates an Arrow buffer from a memory pool. - /// - /// Size of buffer (in bytes) to allocate. - /// Memory pool to use for allocation. If null, a default memory pool is used. - /// - public static ArrowBuffer Allocate(int size, MemoryPool memoryPool = null) - { - if (memoryPool == null) - memoryPool = DefaultMemoryPool.Instance.Value; + public ReadOnlyMemory Memory { get; } - var buffer = memoryPool.Allocate(size); + public bool IsEmpty => Memory.IsEmpty; - return new ArrowBuffer(buffer, size); - } + public int Length => Memory.Length; - /// - /// Allocates an Arrow buffer the same length as the incoming data, then - /// copies the specified data to the arrow buffer. - /// - /// Data to copy into a new arrow buffer. - /// Memory pool to use for allocation. If null, a default memory pool is used. - /// - public static ArrowBuffer FromMemory(Memory data, MemoryPool memoryPool = default) + public ReadOnlySpan Span { - var buffer = Allocate(data.Length, memoryPool); - data.CopyTo(buffer.Memory); - return buffer; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Memory.Span; } - public async Task CopyToAsync(Stream stream, CancellationToken cancellationToken = default) + public ArrowBuffer Clone(MemoryPool pool = default) { - const float chunkSize = 8192f; - - // TODO: Is there a better copy mechanism to use here that does not involve allocating buffers and targets .NET Standard 1.3? 
- // NOTE: Consider specialization for .NET Core 2.1 - - var length = Convert.ToInt32(chunkSize); - var buffer = ArrayPool.Shared.Rent(length); - var count = Convert.ToInt32(Math.Ceiling(Memory.Length / chunkSize)); - var offset = 0; - - try - { - for (var i = 0; i < count; i++) - { - var n = Math.Min(length, Memory.Length); - var slice = Memory.Slice(offset, n); - - slice.CopyTo(buffer); - - await stream.WriteAsync(buffer, 0, n, cancellationToken); - - offset += n; - } - } - finally - { - if (buffer != null) - { - ArrayPool.Shared.Return(buffer); - } - } + return new Builder(Span.Length) + .Append(Span) + .Build(pool); } - public Memory Memory { get; } - - public bool IsEmpty => Memory.IsEmpty; - - public int Size { get; } - - public int Capacity => Memory.Length; - - public Span GetSpan(int offset) - where T : struct => - MemoryMarshal.Cast( - Memory.Span.Slice(offset)); - - public Span GetSpan(int offset, int length) - where T : struct => - MemoryMarshal.Cast( - Memory.Span.Slice(offset, length)); - - public Span GetSpan() - where T: struct => - MemoryMarshal.Cast(Memory.Span); - public bool Equals(ArrowBuffer other) { - var lhs = GetSpan(); - var rhs = other.GetSpan(); - return lhs.SequenceEqual(rhs); + return Span.SequenceEqual(other.Span); } } } diff --git a/csharp/src/Apache.Arrow/BitUtility.cs b/csharp/src/Apache.Arrow/BitUtility.cs index ea5a556162e53..3b4ee7a43d268 100644 --- a/csharp/src/Apache.Arrow/BitUtility.cs +++ b/csharp/src/Apache.Arrow/BitUtility.cs @@ -20,8 +20,7 @@ namespace Apache.Arrow { public static class BitUtility { - private static readonly byte[] PopcountTable = new byte[] - { + private static readonly byte[] PopcountTable = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, @@ -32,8 +31,7 @@ public static class BitUtility 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; - private static readonly byte[] BitMask = new byte[] - { + private static readonly byte[] BitMask = { 1, 2, 4, 8, 16, 32, 64, 128 }; diff --git a/csharp/src/Apache.Arrow/Bitmap.cs b/csharp/src/Apache.Arrow/Bitmap.cs deleted file mode 100644 index 257438b323c7d..0000000000000 --- a/csharp/src/Apache.Arrow/Bitmap.cs +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
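With the `Bitmap` wrapper deleted here, validity checks go through `BitUtility` directly (see the `Array.cs` hunk earlier, where `IsValid` becomes a `BitUtility.GetBit` call). A small sketch of the bit-level API this relies on, using the `Span<byte>` overloads visible in the deleted `Bitmap` code:

```csharp
using System;
using Apache.Arrow;

class ValiditySketch
{
    static void Main()
    {
        // Build a one-byte validity bitmap: bits 0..2 set, bit 3 clear.
        Span<byte> bitmap = stackalloc byte[1];
        BitUtility.SetBit(bitmap, 0);
        BitUtility.SetBit(bitmap, 1);
        BitUtility.SetBit(bitmap, 2);

        // IsValid(index) in Array.cs now reduces to exactly this check.
        Console.WriteLine(BitUtility.GetBit(bitmap, 2));  // True
        Console.WriteLine(BitUtility.GetBit(bitmap, 3));  // False
    }
}
```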
- -using Apache.Arrow.Memory; - -namespace Apache.Arrow -{ - public struct Bitmap - { - public ArrowBuffer Buffer { get; } - - public int Length => Buffer.Size; - - public Bitmap(ArrowBuffer buffer) - { - Buffer = buffer; - } - - public static implicit operator Bitmap(ArrowBuffer buffer) - { - return new Bitmap(buffer); - } - - public static implicit operator ArrowBuffer(Bitmap bitmap) - { - return bitmap.Buffer; - } - - public static Bitmap Allocate(int bitCount, MemoryPool memoryPool = default) - { - var size = bitCount / 8 + (bitCount % 8 > 0 ? 1 : 0); - var remainder = size % 64; - var len = (remainder == 0) ? size : size + 64 - remainder; - - // Allocate buffer from memory pool and enable all bits - - var buffer = ArrowBuffer.Allocate(len, memoryPool); - var span = buffer.GetSpan(); - - span.Fill(0xff); - - return new Bitmap(buffer); - } - - public void Clear(int index) - { - BitUtility.ClearBit( - Buffer.GetSpan(), index); - } - - public void Set(int index) - { - BitUtility.SetBit( - Buffer.GetSpan(), index); - } - - public bool IsSet(int index) - { - return BitUtility.GetBit( - Buffer.GetSpan(), index); - } - } -} diff --git a/csharp/src/Apache.Arrow/Memory/DefaultMemoryPool.cs b/csharp/src/Apache.Arrow/Extensions/SpanExtensions.cs similarity index 59% rename from csharp/src/Apache.Arrow/Memory/DefaultMemoryPool.cs rename to csharp/src/Apache.Arrow/Extensions/SpanExtensions.cs index bf6597918490e..b759f38060703 100644 --- a/csharp/src/Apache.Arrow/Memory/DefaultMemoryPool.cs +++ b/csharp/src/Apache.Arrow/Extensions/SpanExtensions.cs @@ -14,25 +14,18 @@ // limitations under the License. using System; -using System.Buffers; using System.Runtime.InteropServices; -namespace Apache.Arrow.Memory +namespace Apache.Arrow { - public class DefaultMemoryPool + public static class SpanExtensions { - public const int DefaultAlignment = 64; - public const int DefaultPadding = 8; - - public static readonly Lazy Instance = new Lazy(BuildDefault, true); - - private static MemoryPool BuildDefault() - { - // TODO: Replace the default memory pool instance with a platform-specific implementation - // of memory pool with fallback to this implementation? - - return new NativeMemoryPool(DefaultPadding, DefaultAlignment); - } + public static Span CastTo(this Span span) + where T: struct => + MemoryMarshal.Cast(span); + public static ReadOnlySpan CastTo(this ReadOnlySpan span) + where T: struct => + MemoryMarshal.Cast(span); } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowFileReader.cs b/csharp/src/Apache.Arrow/Ipc/ArrowFileReader.cs index c47eab5884d45..61c7627f0769b 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowFileReader.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowFileReader.cs @@ -13,14 +13,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
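The new `SpanExtensions.CastTo<T>` centralizes the `MemoryMarshal.Cast` calls that previously appeared inline in each array class. A sketch of the reinterpreting view it produces (generic argument reconstructed from the diff; the printed values assume a little-endian machine):

```csharp
using System;
using Apache.Arrow;

class CastToSketch
{
    static void Main()
    {
        // Reinterpret 8 raw bytes as two Int32 values, the same way
        // BinaryArray.ValueOffsets views an offsets buffer as int.
        ReadOnlySpan<byte> raw = new byte[] { 1, 0, 0, 0, 2, 0, 0, 0 };
        ReadOnlySpan<int> ints = raw.CastTo<int>();

        Console.WriteLine(ints[0]);  // 1
        Console.WriteLine(ints[1]);  // 2
    }
}
```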
+using FlatBuffers; using System; -using System.Buffers; using System.Buffers.Binary; using System.IO; -using System.Threading.Tasks; -using FlatBuffers; -using System.Threading; using System.Linq; +using System.Threading; +using System.Threading.Tasks; namespace Apache.Arrow.Ipc { diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs index dac7b5bee21fa..98fbdf0be312d 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowFileWriter.cs @@ -16,7 +16,6 @@ using System; using System.Buffers.Binary; using System.Collections.Generic; -using System.Diagnostics; using System.IO; using System.Threading; using System.Threading.Tasks; @@ -38,6 +37,8 @@ public ArrowFileWriter(Stream stream, Schema schema) throw new ArgumentException("stream must be writable", nameof(stream)); } + // TODO: Remove seek requirement + if (!stream.CanSeek) { throw new ArgumentException("stream must be seekable", nameof(stream)); @@ -66,7 +67,7 @@ public override async Task WriteRecordBatchAsync(RecordBatch recordBatch, Cancel RecordBatchBlocks.Add(block); } - public async Task CloseAsync(CancellationToken cancellationToken = default) + public async Task WriteFooterAsync(CancellationToken cancellationToken = default) { if (!HasWrittenFooter) { @@ -77,19 +78,6 @@ public async Task CloseAsync(CancellationToken cancellationToken = default) await BaseStream.FlushAsync(cancellationToken); } - public override void Dispose() - { - try - { - CloseAsync().GetAwaiter().GetResult(); - } - catch(Exception ex) - { - // NOTE: Dispose shouldn't throw. - Debug.WriteLine(ex); - } - } - private async Task WriteHeaderAsync(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); @@ -144,10 +132,12 @@ private async Task WriteFooterAsync(Schema schema, CancellationToken cancellatio cancellationToken.ThrowIfCancellationRequested(); - Buffers.RentReturn(4, (buffer) => + await Buffers.RentReturnAsync(4, async (buffer) => { BinaryPrimitives.WriteInt32LittleEndian(buffer, Convert.ToInt32(BaseStream.Position - offset)); + + await BaseStream.WriteAsync(buffer, 0, 4, cancellationToken); }); // Write magic diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamReader.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamReader.cs index 18e254c805b0e..f6e1ca5d06971 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamReader.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamReader.cs @@ -219,11 +219,14 @@ private static ArrowBuffer BuildArrowBuffer(FlatBuffers.ByteBuffer bodyData, Fla { if (buffer.Length <= 0) { - return null; + return ArrowBuffer.Empty; } var segment = bodyData.ToArraySegment((int)buffer.Offset, (int)buffer.Length); - return ArrowBuffer.FromMemory(segment); + + return new ArrowBuffer.Builder(segment.Count) + .Append(segment) + .Build(); } private static ArrayData LoadPrimitiveField(Field field, diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 980a42dbf5f19..639c64a0363d3 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -60,16 +60,15 @@ public Buffer(ArrowBuffer buffer, int offset, int length) } private readonly List _buffers; - private int _offset; public IReadOnlyList Buffers => _buffers; - public int TotalLength => _offset; + public int TotalLength { get; private set; } public ArrowRecordBatchFlatBufferBuilder() { _buffers = new List(); - _offset = 0; + TotalLength = 0; } public 
void Visit(Int8Array array) => CreateBuffers(array); @@ -113,16 +112,11 @@ private void CreateBuffers(PrimitiveArray array) private Buffer CreateBuffer(ArrowBuffer buffer) { - if (buffer == null) - { - return new Buffer(null, _offset, 0); - } - - var offset = _offset; + var offset = TotalLength; - _offset += buffer.Capacity; + TotalLength += buffer.Length; - return new Buffer(buffer, offset, buffer.Capacity); + return new Buffer(buffer, offset, buffer.Length); } public void Visit(IArrowArray array) @@ -176,6 +170,8 @@ public ArrowStreamWriter(Stream baseStream, Schema schema) protected virtual async Task WriteRecordBatchInternalAsync(RecordBatch recordBatch, CancellationToken cancellationToken = default) { + // TODO: Truncate buffers with extraneous padding / unused capacity + if (!HasWrittenSchema) { await WriteSchemaAsync(Schema, cancellationToken); @@ -243,10 +239,11 @@ await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, for (var i = 0; i < buffers.Count; i++) { - if (buffers[i].DataBuffer == null) + if (buffers[i].DataBuffer.IsEmpty) continue; - await buffers[i].DataBuffer.CopyToAsync(BaseStream, cancellationToken); + + await WriteBufferAsync(buffers[i].DataBuffer, cancellationToken); } // Write padding so the record batch message body length is a multiple of 8 bytes @@ -257,7 +254,7 @@ await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, await WritePaddingAsync(bodyPaddingLength); return new Block( - offset: Convert.ToInt32(metadataOffset), + offset: Convert.ToInt32(metadataOffset), length: bodyLength + bodyPaddingLength, metadataLength: Convert.ToInt32(metadataLength)); } @@ -266,6 +263,22 @@ public virtual Task WriteRecordBatchAsync(RecordBatch recordBatch, CancellationT { return WriteRecordBatchInternalAsync(recordBatch, cancellationToken); } + + public Task WriteBufferAsync(ArrowBuffer arrowBuffer, CancellationToken cancellationToken = default) + { + byte[] buffer = null; + try + { + var span = arrowBuffer.Span; + buffer = ArrayPool.Shared.Rent(span.Length); + span.CopyTo(buffer); + return BaseStream.WriteAsync(buffer, 0, buffer.Length, cancellationToken); + } + finally + { + ArrayPool.Shared.Return(buffer); + } + } protected Offset SerializeSchema(Schema schema) { diff --git a/csharp/src/Apache.Arrow/Memory/MemoryPool.cs b/csharp/src/Apache.Arrow/Memory/MemoryPool.cs index 1e2c173975b2b..569ca7439f83b 100644 --- a/csharp/src/Apache.Arrow/Memory/MemoryPool.cs +++ b/csharp/src/Apache.Arrow/Memory/MemoryPool.cs @@ -21,6 +21,10 @@ namespace Apache.Arrow.Memory public abstract class MemoryPool { + public const int DefaultAlignment = 64; + + public static Lazy Default { get; } = new Lazy(BuildDefault, true); + public class Stats { private long _bytesAllocated; @@ -38,9 +42,12 @@ internal void Allocate(int n) public Stats Statistics { get; } - protected MemoryPool() + protected int Alignment { get; } + + protected MemoryPool(int alignment = DefaultAlignment) { Statistics = new Stats(); + Alignment = alignment; } public Memory Allocate(int length) @@ -50,14 +57,18 @@ public Memory Allocate(int length) throw new ArgumentOutOfRangeException(nameof(length)); } - var bytesAllocated = 0; - var memory = AllocateInternal(length, out bytesAllocated); + if (length == 0) + { + return Memory.Empty; + } + + var memory = AllocateInternal(length, out var bytesAllocated); - Statistics.Allocate(length); + Statistics.Allocate(bytesAllocated); // Ensure all allocated memory is zeroed. 
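One thing to watch in the new `WriteBufferAsync` above: the `Task` from `BaseStream.WriteAsync` is returned without being awaited, so the `finally` block can hand the rented array back to the pool while the write is still in flight, and `buffer.Length` (the rented size, which `ArrayPool` may round up) is written instead of `span.Length`. A sketch of an awaited variant, assuming `using System.Buffers;` and an otherwise identical class; this is illustrative, not part of the patch:

```csharp
public async Task WriteBufferAsync(ArrowBuffer arrowBuffer, CancellationToken cancellationToken = default)
{
    byte[] buffer = null;
    try
    {
        var length = arrowBuffer.Span.Length;
        buffer = ArrayPool<byte>.Shared.Rent(length);
        arrowBuffer.Span.CopyTo(buffer);

        // Await so the rented array is not returned mid-write, and write
        // only the copied bytes (Rent may hand back a larger array).
        await BaseStream.WriteAsync(buffer, 0, length, cancellationToken);
    }
    finally
    {
        if (buffer != null)
        {
            ArrayPool<byte>.Shared.Return(buffer);
        }
    }
}
```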
- ZeroMemory(memory); + ZeroMemory(memory.Span); return memory; } @@ -69,18 +80,32 @@ public Memory Reallocate(Memory memory, int length) throw new ArgumentOutOfRangeException(nameof(length)); } - var bytesAllocated = 0; - var buffer = ReallocateInternal(memory, length, out bytesAllocated); + if (length == 0) + { + return Memory.Empty; + } + + var buffer = ReallocateInternal(memory, length, out var bytesAllocated); Statistics.Allocate(bytesAllocated); + if (length > memory.Length) + { + ZeroMemory(buffer.Span.Slice( + memory.Length, length - memory.Length)); + } + return buffer; + } + private static void ZeroMemory(Span span) + { + span.Fill(0); } - private static void ZeroMemory(Memory memory) + private static MemoryPool BuildDefault() { - memory.Span.Fill(0); + return new NativeMemoryPool(DefaultAlignment); } protected abstract Memory AllocateInternal(int length, out int bytesAllocated); diff --git a/csharp/src/Apache.Arrow/Memory/NativeMemory.cs b/csharp/src/Apache.Arrow/Memory/NativeMemory.cs index c5e29aa2a2318..a188f453eaf02 100644 --- a/csharp/src/Apache.Arrow/Memory/NativeMemory.cs +++ b/csharp/src/Apache.Arrow/Memory/NativeMemory.cs @@ -25,8 +25,8 @@ namespace Apache.Arrow.Memory public class NativeMemoryManager: MemoryManager { private IntPtr _ptr; - private int _offset; - private int _length; + private readonly int _offset; + private readonly int _length; public NativeMemoryManager(IntPtr ptr, int offset, int length) { @@ -40,13 +40,13 @@ public NativeMemoryManager(IntPtr ptr, int offset, int length) Dispose(false); } - public unsafe override Span GetSpan() + public override unsafe Span GetSpan() { var ptr = CalculatePointer(0); return new Span(ptr, _length); } - public unsafe override MemoryHandle Pin(int elementIndex = 0) + public override unsafe MemoryHandle Pin(int elementIndex = 0) { // NOTE: Unmanaged memory doesn't require GC pinning because by definition it's not // managed by the garbage collector. diff --git a/csharp/src/Apache.Arrow/Memory/NativeMemoryPool.cs b/csharp/src/Apache.Arrow/Memory/NativeMemoryPool.cs index 9413951c39161..2ea07ce451653 100644 --- a/csharp/src/Apache.Arrow/Memory/NativeMemoryPool.cs +++ b/csharp/src/Apache.Arrow/Memory/NativeMemoryPool.cs @@ -20,19 +20,8 @@ namespace Apache.Arrow.Memory { public class NativeMemoryPool : MemoryPool { - private readonly int _padding; - private readonly int _alignment; - - public NativeMemoryPool(int padding, int alignment) - { - if (padding < 0) throw new ArgumentOutOfRangeException(nameof(padding)); - if (alignment < 0) throw new ArgumentOutOfRangeException(nameof(alignment)); - - // TODO: Ensure alignment is a power of two. - - _padding = padding; - _alignment = alignment; - } + public NativeMemoryPool(int alignment = DefaultAlignment) + : base(alignment) { } protected override Memory AllocateInternal(int length, out int bytesAllocated) { @@ -42,14 +31,13 @@ protected override Memory AllocateInternal(int length, out int bytesAlloca // to allocated memory, offset, and the allocation size. // TODO: Should the allocation be moved to NativeMemory? 
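The native pool below over-allocates by `Alignment` bytes and then offsets into the block so the returned region starts on an aligned address. A standalone check of that offset arithmetic (a sketch only, not library code; note that an already-aligned pointer still gets offset by a full `Alignment`, matching the expression in the diff):

```csharp
using System;

class AlignmentSketch
{
    static void Main()
    {
        const int alignment = 64;  // must be a power of two

        // Pretend AllocHGlobal returned address 1000. 1000 & 63 == 40,
        // so we skip 64 - 40 = 24 bytes; 1024 is the next multiple of 64.
        long ptr = 1000;
        var offset = (int)(alignment - (ptr & (alignment - 1)));

        Console.WriteLine(offset);        // 24
        Console.WriteLine(ptr + offset);  // 1024, i.e. 64-byte aligned
    }
}
```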
-
-            var size = BitUtility.RoundUpToMultiplePowerOfTwo(length, _padding);
-            var ptr = Marshal.AllocHGlobal(size + _alignment);
-            var offset = (int)(_alignment - (ptr.ToInt64() & (_alignment - 1)));
-
-            var manager = new NativeMemoryManager(ptr, offset, size);

-            bytesAllocated = (size + _alignment);
+            var size = length + Alignment;
+            var ptr = Marshal.AllocHGlobal(size);
+            var offset = (int)(Alignment - (ptr.ToInt64() & (Alignment - 1)));
+            var manager = new NativeMemoryManager(ptr, offset, length);
+
+            bytesAllocated = (length + Alignment);

            GC.AddMemoryPressure(bytesAllocated);

diff --git a/csharp/src/Apache.Arrow/Types/ArrowType.cs b/csharp/src/Apache.Arrow/Types/ArrowType.cs
index 9e4b3608b9771..c0eca23da55ac 100644
--- a/csharp/src/Apache.Arrow/Types/ArrowType.cs
+++ b/csharp/src/Apache.Arrow/Types/ArrowType.cs
@@ -13,6 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

+
namespace Apache.Arrow.Types
{
    public abstract class ArrowType: IArrowType
@@ -24,5 +25,19 @@ public abstract class ArrowType: IArrowType
        public virtual bool IsFixedWidth => false;

        public abstract void Accept(IArrowTypeVisitor visitor);
+
+        internal static void Accept<T>(T type, IArrowTypeVisitor visitor)
+            where T : class, IArrowType
+        {
+            switch (visitor)
+            {
+                case IArrowTypeVisitor<T> typedVisitor:
+                    typedVisitor.Visit(type);
+                    break;
+                default:
+                    visitor.Visit(type);
+                    break;
+            }
+        }
    }
}
diff --git a/csharp/src/Apache.Arrow/Types/BinaryType.cs b/csharp/src/Apache.Arrow/Types/BinaryType.cs
index 8ae753360c0b3..6734d93ad2e7a 100644
--- a/csharp/src/Apache.Arrow/Types/BinaryType.cs
+++ b/csharp/src/Apache.Arrow/Types/BinaryType.cs
@@ -13,9 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-using System;
-using System.Collections.Generic;
-using System.Text;

namespace Apache.Arrow.Types
{
@@ -26,10 +23,6 @@ public class BinaryType: ArrowType
        public override ArrowTypeId TypeId => ArrowTypeId.Binary;
        public override string Name => "binary";

-        public override void Accept(IArrowTypeVisitor visitor)
-        {
-            if (visitor is IArrowTypeVisitor<BinaryType> v)
-                v.Visit(this);
-        }
+        public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor);
    }
}
diff --git a/csharp/src/Apache.Arrow/Types/BooleanType.cs b/csharp/src/Apache.Arrow/Types/BooleanType.cs
index 5a26c879f5fa8..3b57414b0179f 100644
--- a/csharp/src/Apache.Arrow/Types/BooleanType.cs
+++ b/csharp/src/Apache.Arrow/Types/BooleanType.cs
@@ -13,13 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
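To see what the new `ArrowType.Accept` helper buys: a visitor that implements the typed interface gets the strongly typed callback, and anything else falls through to the plain `Visit(IArrowType)`. A sketch under the assumption that the stripped generic parameters read `Accept<T>` / `IArrowTypeVisitor<T>` as reconstructed above, and that `IArrowTypeVisitor<T>` derives from the non-generic `IArrowTypeVisitor`:

```csharp
using System;
using Apache.Arrow.Types;

// Handles Int32Type specifically; every other type takes the fallback path.
class TypeNamePrinter : IArrowTypeVisitor<Int32Type>
{
    public void Visit(Int32Type type) => Console.WriteLine($"typed: {type.Name}");
    public void Visit(IArrowType type) => Console.WriteLine($"fallback: {type.Name}");
}

class VisitorSketch
{
    static void Main()
    {
        var printer = new TypeNamePrinter();
        Int32Type.Default.Accept(printer);   // typed path
        StringType.Default.Accept(printer);  // fallback path, prints "utf8"
    }
}
```

This is the pattern every type file below now delegates to, which is why each `Accept` override collapses to the one-line `=> Accept(this, visitor);`.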
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class BooleanType: NumberType + public sealed class BooleanType: NumberType { public static readonly BooleanType Default = new BooleanType(); @@ -28,10 +25,6 @@ public class BooleanType: NumberType public override int BitWidth => 1; public override bool IsSigned => false; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/Date32Type.cs b/csharp/src/Apache.Arrow/Types/Date32Type.cs index 19e6823602a6b..9673bf62ff6e7 100644 --- a/csharp/src/Apache.Arrow/Types/Date32Type.cs +++ b/csharp/src/Apache.Arrow/Types/Date32Type.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class Date32Type: DateType + public sealed class Date32Type: DateType { public static readonly Date32Type Default = new Date32Type(); @@ -28,10 +25,6 @@ public class Date32Type: DateType public override int BitWidth => 32; public override DateUnit Unit => DateUnit.Day; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/Date64Type.cs b/csharp/src/Apache.Arrow/Types/Date64Type.cs index d301ca4130d56..2a9e1aac0d6ea 100644 --- a/csharp/src/Apache.Arrow/Types/Date64Type.cs +++ b/csharp/src/Apache.Arrow/Types/Date64Type.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class Date64Type: DateType + public sealed class Date64Type : DateType { public static readonly Date64Type Default = new Date64Type(); @@ -28,10 +25,6 @@ public class Date64Type: DateType public override int BitWidth => 64; public override DateUnit Unit => DateUnit.Milliseconds; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/DateType.cs b/csharp/src/Apache.Arrow/Types/DateType.cs index 1fa2a32294b9f..8f15b08fc88e5 100644 --- a/csharp/src/Apache.Arrow/Types/DateType.cs +++ b/csharp/src/Apache.Arrow/Types/DateType.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/DecimalType.cs b/csharp/src/Apache.Arrow/Types/DecimalType.cs index adb9a06d1a1b5..cad2e2428727b 100644 --- a/csharp/src/Apache.Arrow/Types/DecimalType.cs +++ b/csharp/src/Apache.Arrow/Types/DecimalType.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class DecimalType: FixedSizeBinaryType + public sealed class DecimalType: FixedSizeBinaryType { public override ArrowTypeId TypeId => ArrowTypeId.Decimal; public override string Name => "decimal"; diff --git a/csharp/src/Apache.Arrow/Types/DoubleType.cs b/csharp/src/Apache.Arrow/Types/DoubleType.cs index 9fb0969130cc2..aa6ade650ef89 100644 --- a/csharp/src/Apache.Arrow/Types/DoubleType.cs +++ b/csharp/src/Apache.Arrow/Types/DoubleType.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class DoubleType: FloatingPointType + public sealed class DoubleType: FloatingPointType { public static readonly DoubleType Default = new DoubleType(); @@ -29,10 +26,6 @@ public class DoubleType: FloatingPointType public override bool IsSigned => true; public override PrecisionKind Precision => PrecisionKind.Double; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/FixedSizeBinaryType.cs b/csharp/src/Apache.Arrow/Types/FixedSizeBinaryType.cs index 6e16730314ffa..ccbfc8c1fe7f6 100644 --- a/csharp/src/Apache.Arrow/Types/FixedSizeBinaryType.cs +++ b/csharp/src/Apache.Arrow/Types/FixedSizeBinaryType.cs @@ -14,12 +14,10 @@ // limitations under the License. using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class FixedSizeBinaryType: FixedWidthType + public class FixedSizeBinaryType : FixedWidthType { public override ArrowTypeId TypeId => ArrowTypeId.FixedSizedBinary; public override string Name => "fixed_size_binary"; @@ -34,12 +32,7 @@ public FixedSizeBinaryType(int byteWidth) ByteWidth = byteWidth; } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); - } } diff --git a/csharp/src/Apache.Arrow/Types/FixedWidthType.cs b/csharp/src/Apache.Arrow/Types/FixedWidthType.cs index 72b46a2b117e2..d1c9e8c1d8b8d 100644 --- a/csharp/src/Apache.Arrow/Types/FixedWidthType.cs +++ b/csharp/src/Apache.Arrow/Types/FixedWidthType.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/FloatType.cs b/csharp/src/Apache.Arrow/Types/FloatType.cs index c4232931d01e5..a3f7b39bf49ff 100644 --- a/csharp/src/Apache.Arrow/Types/FloatType.cs +++ b/csharp/src/Apache.Arrow/Types/FloatType.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class FloatType: FloatingPointType + public sealed class FloatType: FloatingPointType { public static readonly FloatType Default = new FloatType(); @@ -29,10 +26,6 @@ public class FloatType: FloatingPointType public override bool IsSigned => true; public override PrecisionKind Precision => PrecisionKind.Single; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/FloatingPointType.cs b/csharp/src/Apache.Arrow/Types/FloatingPointType.cs index 5f667c72226ae..9fbe43a99b6b2 100644 --- a/csharp/src/Apache.Arrow/Types/FloatingPointType.cs +++ b/csharp/src/Apache.Arrow/Types/FloatingPointType.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/HalfFloatType.cs b/csharp/src/Apache.Arrow/Types/HalfFloatType.cs index 22f1370af5f54..5bfa232dc18d5 100644 --- a/csharp/src/Apache.Arrow/Types/HalfFloatType.cs +++ b/csharp/src/Apache.Arrow/Types/HalfFloatType.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class HalfFloatType: FloatingPointType + public sealed class HalfFloatType: FloatingPointType { public static readonly HalfFloatType Default = new HalfFloatType(); @@ -29,10 +26,6 @@ public class HalfFloatType: FloatingPointType public override bool IsSigned => true; public override PrecisionKind Precision => PrecisionKind.Half; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/IArrowType.cs b/csharp/src/Apache.Arrow/Types/IArrowType.cs index d75be542a9237..578e18b9dedf5 100644 --- a/csharp/src/Apache.Arrow/Types/IArrowType.cs +++ b/csharp/src/Apache.Arrow/Types/IArrowType.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/IArrowTypeVisitor.cs b/csharp/src/Apache.Arrow/Types/IArrowTypeVisitor.cs index 3cd602ba18863..ce5b114bf3178 100644 --- a/csharp/src/Apache.Arrow/Types/IArrowTypeVisitor.cs +++ b/csharp/src/Apache.Arrow/Types/IArrowTypeVisitor.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/Int16Type.cs b/csharp/src/Apache.Arrow/Types/Int16Type.cs index 3a7edbe0bd5a7..f1d6868ba8ae1 100644 --- a/csharp/src/Apache.Arrow/Types/Int16Type.cs +++ b/csharp/src/Apache.Arrow/Types/Int16Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class Int16Type : NumberType + public sealed class Int16Type : NumberType { public static readonly Int16Type Default = new Int16Type(); @@ -24,10 +24,6 @@ public class Int16Type : NumberType public override int BitWidth => 16; public override bool IsSigned => true; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/Int32Type.cs b/csharp/src/Apache.Arrow/Types/Int32Type.cs index e8df522679a0f..a32c884629831 100644 --- a/csharp/src/Apache.Arrow/Types/Int32Type.cs +++ b/csharp/src/Apache.Arrow/Types/Int32Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class Int32Type : NumberType + public sealed class Int32Type : NumberType { public static readonly Int32Type Default = new Int32Type(); @@ -24,10 +24,6 @@ public class Int32Type : NumberType public override int BitWidth => 32; public override bool IsSigned => true; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/Int64Type.cs b/csharp/src/Apache.Arrow/Types/Int64Type.cs index afdf4098861de..f45523cfb3303 100644 --- a/csharp/src/Apache.Arrow/Types/Int64Type.cs +++ b/csharp/src/Apache.Arrow/Types/Int64Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class Int64Type : NumberType + public sealed class Int64Type : NumberType { public static readonly Int64Type Default = new Int64Type(); @@ -24,10 +24,6 @@ public class Int64Type : NumberType public override int BitWidth => 64; public override bool IsSigned => true; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/Int8Type.cs b/csharp/src/Apache.Arrow/Types/Int8Type.cs index 9687cd349cc3d..9b3f5b5b4fc96 100644 --- a/csharp/src/Apache.Arrow/Types/Int8Type.cs +++ b/csharp/src/Apache.Arrow/Types/Int8Type.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class Int8Type: NumberType + public sealed class Int8Type : NumberType { public static readonly Int8Type Default = new Int8Type(); @@ -28,10 +25,6 @@ public class Int8Type: NumberType public override int BitWidth => 8; public override bool IsSigned => true; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/IntervalUnit.cs b/csharp/src/Apache.Arrow/Types/IntervalUnit.cs index e287548ef6206..6dda0cfe94b78 100644 --- a/csharp/src/Apache.Arrow/Types/IntervalUnit.cs +++ b/csharp/src/Apache.Arrow/Types/IntervalUnit.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { @@ -25,7 +22,7 @@ public enum IntervalUnit DayTime = 1 } - public class IntervalType: FixedWidthType + public sealed class IntervalType : FixedWidthType { public override ArrowTypeId TypeId => ArrowTypeId.Interval; public override string Name => "date"; @@ -38,10 +35,6 @@ public IntervalType(IntervalUnit unit = IntervalUnit.YearMonth) Unit = unit; } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/ListType.cs b/csharp/src/Apache.Arrow/Types/ListType.cs index fa5c87cd4eb56..5d48a610feab1 100644 --- a/csharp/src/Apache.Arrow/Types/ListType.cs +++ b/csharp/src/Apache.Arrow/Types/ListType.cs @@ -14,12 +14,10 @@ // limitations under the License. using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class ListType: ArrowType + public sealed class ListType : ArrowType { public override ArrowTypeId TypeId => ArrowTypeId.List; public override string Name => "list"; @@ -33,10 +31,6 @@ public ListType(Field valueField, IArrowType valueDataType) ValueDataType = valueDataType ?? NullType.Default; } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/NullType.cs b/csharp/src/Apache.Arrow/Types/NullType.cs index b299ed8c93836..4afe1dc38a300 100644 --- a/csharp/src/Apache.Arrow/Types/NullType.cs +++ b/csharp/src/Apache.Arrow/Types/NullType.cs @@ -13,23 +13,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class NullType: ArrowType + public sealed class NullType : ArrowType { public static readonly NullType Default = new NullType(); public override ArrowTypeId TypeId => ArrowTypeId.Null; public override string Name => "null"; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<NullType> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/NumberType.cs b/csharp/src/Apache.Arrow/Types/NumberType.cs index a80bd44fe4312..04d21bc8cfb1b 100644 --- a/csharp/src/Apache.Arrow/Types/NumberType.cs +++ b/csharp/src/Apache.Arrow/Types/NumberType.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/StringType.cs b/csharp/src/Apache.Arrow/Types/StringType.cs index f2bb0822f8039..33620aad9e9c5 100644 --- a/csharp/src/Apache.Arrow/Types/StringType.cs +++ b/csharp/src/Apache.Arrow/Types/StringType.cs @@ -13,23 +13,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class StringType: ArrowType + public sealed class StringType : ArrowType { public static StringType Default = new StringType(); public override ArrowTypeId TypeId => ArrowTypeId.String; public override string Name => "utf8"; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<StringType> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/StructType.cs b/csharp/src/Apache.Arrow/Types/StructType.cs index f9b9e0ce1dfdf..fb074c101309d 100644 --- a/csharp/src/Apache.Arrow/Types/StructType.cs +++ b/csharp/src/Apache.Arrow/Types/StructType.cs @@ -19,7 +19,7 @@ namespace Apache.Arrow.Types { - public class StructType: ArrowType + public sealed class StructType : ArrowType { private readonly List<Field> _fields; @@ -55,10 +55,6 @@ public int GetFieldIndex(string name, field => comparer.Equals(field.Name, name)); } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<StructType> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/Time32Type.cs b/csharp/src/Apache.Arrow/Types/Time32Type.cs index 70cfe509727e3..99c409babdb26 100644 --- a/csharp/src/Apache.Arrow/Types/Time32Type.cs +++ b/csharp/src/Apache.Arrow/Types/Time32Type.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License.
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class Time32Type: TimeType + public sealed class Time32Type : TimeType { public static readonly Time32Type Default = new Time32Type(); @@ -30,10 +27,6 @@ public class Time32Type: TimeType public Time32Type(TimeUnit unit = TimeUnit.Millisecond) : base(unit) { } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<Time32Type> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/Time64Type.cs b/csharp/src/Apache.Arrow/Types/Time64Type.cs index 3f727859c60b0..5d6c2e46e1b56 100644 --- a/csharp/src/Apache.Arrow/Types/Time64Type.cs +++ b/csharp/src/Apache.Arrow/Types/Time64Type.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class Time64Type: TimeType + public sealed class Time64Type : TimeType { public static readonly Time64Type Default = new Time64Type(); @@ -30,10 +27,6 @@ public class Time64Type: TimeType public Time64Type(TimeUnit unit = TimeUnit.Millisecond) : base(unit) { } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<Time64Type> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/TimeType.cs b/csharp/src/Apache.Arrow/Types/TimeType.cs index dba488b4ea706..9afa3fb62cdc7 100644 --- a/csharp/src/Apache.Arrow/Types/TimeType.cs +++ b/csharp/src/Apache.Arrow/Types/TimeType.cs @@ -13,9 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { diff --git a/csharp/src/Apache.Arrow/Types/TimestampType.cs b/csharp/src/Apache.Arrow/Types/TimestampType.cs index 22da8328dc527..4137818232c19 100644 --- a/csharp/src/Apache.Arrow/Types/TimestampType.cs +++ b/csharp/src/Apache.Arrow/Types/TimestampType.cs @@ -13,13 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License.
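(Editorial note: the consumer-facing contract is unchanged by this refactor. A hypothetical visitor over a few of the types touched here; the `Visit(IArrowType)` fallback overload and the `Unit`/`Name` properties are assumptions inferred from the removed bodies and the surrounding context lines:

```csharp
using System;
using Apache.Arrow.Types;

// Implements the typed interface only for the cases it cares about; Accept
// routes each type to the matching overload and everything else to the
// untyped fallback (assuming the base interface declares one).
class TypeDescriber : IArrowTypeVisitor<Int32Type>, IArrowTypeVisitor<Time64Type>
{
    public void Visit(Int32Type type) => Console.WriteLine($"{type.Name}: {type.BitWidth}-bit");
    public void Visit(Time64Type type) => Console.WriteLine($"{type.Name}: unit {type.Unit}");
    public void Visit(IArrowType type) => Console.WriteLine($"no typed handler for {type.Name}");
}

class Demo
{
    static void Main()
    {
        var describer = new TypeDescriber();
        Int32Type.Default.Accept(describer);   // dispatches to Visit(Int32Type)
        Time64Type.Default.Accept(describer);  // dispatches to Visit(Time64Type)
        StringType.Default.Accept(describer);  // lands in the untyped fallback
    }
}
```
)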
-using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Types { - public class TimestampType: FixedWidthType + public sealed class TimestampType : FixedWidthType { public static readonly TimestampType Default = new TimestampType(TimeUnit.Millisecond, "UTC"); @@ -38,10 +35,6 @@ public TimestampType( Timezone = timezone; } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<TimestampType> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/src/Apache.Arrow/Types/UInt16Type.cs b/csharp/src/Apache.Arrow/Types/UInt16Type.cs index eb87729cf52cf..1925ffb86b790 100644 --- a/csharp/src/Apache.Arrow/Types/UInt16Type.cs +++ b/csharp/src/Apache.Arrow/Types/UInt16Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class UInt16Type : NumberType + public sealed class UInt16Type : NumberType { public static readonly UInt16Type Default = new UInt16Type(); @@ -24,10 +24,6 @@ public class UInt16Type : NumberType public override int BitWidth => 16; public override bool IsSigned => false; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<UInt16Type> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/UInt32Type.cs b/csharp/src/Apache.Arrow/Types/UInt32Type.cs index e520004470885..8007025f30618 100644 --- a/csharp/src/Apache.Arrow/Types/UInt32Type.cs +++ b/csharp/src/Apache.Arrow/Types/UInt32Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class UInt32Type : NumberType + public sealed class UInt32Type : NumberType { public static readonly UInt32Type Default = new UInt32Type(); @@ -24,10 +24,6 @@ public class UInt32Type : NumberType public override int BitWidth => 32; public override bool IsSigned => false; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<UInt32Type> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/UInt64Type.cs b/csharp/src/Apache.Arrow/Types/UInt64Type.cs index 45c6fac056833..20b51ad44f548 100644 --- a/csharp/src/Apache.Arrow/Types/UInt64Type.cs +++ b/csharp/src/Apache.Arrow/Types/UInt64Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class UInt64Type : NumberType + public sealed class UInt64Type : NumberType { public static readonly UInt64Type Default = new UInt64Type(); @@ -24,10 +24,6 @@ public class UInt64Type : NumberType public override int BitWidth => 64; public override bool IsSigned => false; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<UInt64Type> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/UInt8Type.cs b/csharp/src/Apache.Arrow/Types/UInt8Type.cs index d63e42b2039be..e2e53657200ec 100644 --- a/csharp/src/Apache.Arrow/Types/UInt8Type.cs +++ b/csharp/src/Apache.Arrow/Types/UInt8Type.cs @@ -15,7 +15,7 @@ namespace Apache.Arrow.Types { - public class UInt8Type : NumberType + public sealed class UInt8Type : NumberType { public static readonly UInt8Type Default = new UInt8Type(); @@ -24,10 +24,6 @@ public class UInt8Type : NumberType public override int BitWidth => 8; public override bool IsSigned =>
false; - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<UInt8Type> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } \ No newline at end of file diff --git a/csharp/src/Apache.Arrow/Types/UnionType.cs b/csharp/src/Apache.Arrow/Types/UnionType.cs index aadb1e7fbbe49..293271018aa26 100644 --- a/csharp/src/Apache.Arrow/Types/UnionType.cs +++ b/csharp/src/Apache.Arrow/Types/UnionType.cs @@ -24,7 +24,7 @@ public enum UnionMode Dense } - public class UnionType: ArrowType + public sealed class UnionType : ArrowType { public override ArrowTypeId TypeId => ArrowTypeId.Union; public override string Name => "union"; @@ -41,10 +41,6 @@ public UnionType( Mode = mode; } - public override void Accept(IArrowTypeVisitor visitor) - { - if (visitor is IArrowTypeVisitor<UnionType> v) - v.Visit(this); - } + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); } } diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index dca8e2d819967..d29279b330a9b 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -6,6 +6,7 @@ netcoreapp2.1 true + <LangVersion>7.3</LangVersion> diff --git a/csharp/test/Apache.Arrow.Tests/ArrowBufferBuilderTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowBufferBuilderTests.cs new file mode 100644 index 0000000000000..eee4d14b39469 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/ArrowBufferBuilderTests.cs @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
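(Editorial note: the new test file that follows is the first coverage for `ArrowBuffer.Builder<T>`. As orientation, a minimal usage sketch assembled strictly from the calls those tests make: the initial-capacity constructor, `Append`/`AppendRange`, `Build()`, and reading back through `Span.CastTo<T>()`:

```csharp
using System;
using Apache.Arrow;

class ArrowBufferBuilderExample
{
    static void Main()
    {
        // Builder<T> starts from a small capacity and grows as values arrive.
        var builder = new ArrowBuffer.Builder<int>(1);
        builder.Append(10);
        builder.Append(20);
        builder.AppendRange(new[] { 30, 40, 50 });

        // Build() produces an immutable ArrowBuffer; per the buffer tests
        // further below, its length is padded to a multiple of 64 bytes.
        ArrowBuffer buffer = builder.Build();

        // The buffer exposes raw bytes; CastTo<T> reinterprets them as ints.
        ReadOnlySpan<int> values = buffer.Span.CastTo<int>();
        Console.WriteLine($"{values[0]} {values[4]}"); // 10 50
    }
}
```
)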
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class ArrowBufferBuilderTests + { + public class Append + { + + [Fact] + public void DoesNotThrowWithNullParameters() + { + var builder = new ArrowBuffer.Builder<int>(); + + builder.AppendRange(null); + builder.Append((Func>) null); + } + + [Fact] + public void CapacityOnlyGrowsWhenLengthWillExceedCapacity() + { + var builder = new ArrowBuffer.Builder<int>(1); + var capacity = builder.Capacity; + + builder.Append(1); + + Assert.Equal(capacity, builder.Capacity); + } + + [Fact] + public void CapacityGrowsAfterAppendWhenLengthExceedsCapacity() + { + var builder = new ArrowBuffer.Builder<int>(1); + var capacity = builder.Capacity; + + builder.Append(1); + builder.Append(2); + + Assert.True(builder.Capacity > capacity); + } + + [Fact] + public void CapacityGrowsAfterAppendSpan() + { + var builder = new ArrowBuffer.Builder<int>(1); + var capacity = builder.Capacity; + var data = Enumerable.Range(0, 10).Select(x => x).ToArray(); + + builder.Append(data); + + Assert.True(builder.Capacity > capacity); + } + + [Fact] + public void LengthIncrementsAfterAppend() + { + var builder = new ArrowBuffer.Builder<int>(1); + var length = builder.Length; + + builder.Append(1); + + Assert.Equal(length + 1, builder.Length); + } + + [Fact] + public void LengthGrowsBySpanLength() + { + var builder = new ArrowBuffer.Builder<int>(1); + var data = Enumerable.Range(0, 10).Select(x => x).ToArray(); + + builder.Append(data); + + Assert.Equal(10, builder.Length); + } + + [Fact] + public void BufferHasExpectedValues() + { + var builder = new ArrowBuffer.Builder<int>(1); + + builder.Append(10); + builder.Append(20); + + var buffer = builder.Build(); + var span = buffer.Span.CastTo<int>(); + + Assert.Equal(10, span[0]); + Assert.Equal(20, span[1]); + Assert.Equal(0, span[2]); + } + } + + public class AppendRange + { + [Fact] + public void CapacityGrowsAfterAppendEnumerable() + { + var builder = new ArrowBuffer.Builder<int>(1); + var capacity = builder.Capacity; + var data = Enumerable.Range(0, 10).Select(x => x); + + builder.AppendRange(data); + + Assert.True(builder.Capacity > capacity); + } + + [Fact] + public void LengthGrowsByEnumerableCount() + { + var builder = new ArrowBuffer.Builder<int>(1); + var length = builder.Length; + var data = Enumerable.Range(0, 10).Select(x => x).ToArray(); + var count = data.Length; + + builder.AppendRange(data); + + Assert.Equal(length + count, builder.Length); + } + + [Fact] + public void BufferHasExpectedValues() + { + var builder = new ArrowBuffer.Builder<int>(1); + var data = Enumerable.Range(0, 10).Select(x => x).ToArray(); + + builder.AppendRange(data); + + var buffer = builder.Build(); + var span = buffer.Span.CastTo<int>(); + + for (var i = 0; i < 10; i++) + { + Assert.Equal(i, span[i]); + } + } + } + + public class Clear + { + [Fact] + public void SetsAllValuesToDefault() + { + var builder = new ArrowBuffer.Builder<int>(1); + var data = Enumerable.Range(0, 10).Select(x => x).ToArray(); + + builder.AppendRange(data); + builder.Clear(); + + var buffer = builder.Build(); + var zeros = Enumerable.Range(0, 10).Select(x => 0).ToArray(); + var values = buffer.Span.CastTo<int>().Slice(0, 10).ToArray(); + + Assert.True(zeros.SequenceEqual(values)); + } + } + + } +} diff --git a/csharp/test/Apache.Arrow.Tests/ArrowBufferTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowBufferTests.cs index 28de056a61166..f618a9bcb65c4 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowBufferTests.cs +++
b/csharp/test/Apache.Arrow.Tests/ArrowBufferTests.cs @@ -13,10 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Runtime.CompilerServices; -using Apache.Arrow.Memory; using Apache.Arrow.Tests.Fixtures; +using System; using Xunit; namespace Apache.Arrow.Tests @@ -34,19 +32,20 @@ public Allocate(DefaultMemoryPoolFixture memoryPoolFixture) } /// <summary> - /// Ensure Arrow buffers are allocated in multiples of 8-bytes. + /// Ensure Arrow buffers are allocated in multiples of 64 bytes. /// </summary> /// <param name="size">number of bytes to allocate</param> /// <param name="expectedCapacity">expected buffer capacity after allocation</param> [Theory] - [InlineData(1, 8)] - [InlineData(8, 8)] - [InlineData(9, 16)] - [InlineData(16, 16)] + [InlineData(1, 64)] + [InlineData(8, 64)] + [InlineData(9, 64)] + [InlineData(65, 128)] public void AllocatesWithExpectedPadding(int size, int expectedCapacity) { - var buffer = ArrowBuffer.Allocate(size, _memoryPoolFixture.MemoryPool); - Assert.Equal(buffer.Capacity, expectedCapacity); + var buffer = new ArrowBuffer.Builder<byte>(size).Build(); + + Assert.Equal(buffer.Length, expectedCapacity); } /// <summary> @@ -59,12 +58,11 @@ public void AllocatesWithExpectedPadding(int size, int expectedCapacity) [InlineData(128)] public unsafe void AllocatesAlignedToMultipleOf64(int size) { - var buffer = ArrowBuffer.Allocate(size, _memoryPoolFixture.MemoryPool); + var buffer = new ArrowBuffer.Builder<byte>(size).Build(); - using (var pin = buffer.Memory.Pin()) - { - var ptr = new IntPtr(pin.Pointer); - Assert.True(ptr.ToInt64() % 64 == 0); + fixed (byte* ptr = &buffer.Span.GetPinnableReference()) + { + Assert.True(new IntPtr(ptr).ToInt64() % 64 == 0); } } @@ -74,10 +72,9 @@ public unsafe void AllocatesAlignedToMultipleOf64(int size) [Fact] public void HasZeroPadding() { - var buffer = ArrowBuffer.Allocate(32, _memoryPoolFixture.MemoryPool); - var span = buffer.GetSpan(); - - foreach (var b in span) + var buffer = new ArrowBuffer.Builder<byte>(10).Build(); + + foreach (var b in buffer.Span) { Assert.Equal(0, b); } diff --git a/csharp/test/Apache.Arrow.Tests/Fixtures/DefaultMemoryPoolFixture.cs b/csharp/test/Apache.Arrow.Tests/Fixtures/DefaultMemoryPoolFixture.cs index a87bfae1353fd..3b867cdb3d69b 100644 --- a/csharp/test/Apache.Arrow.Tests/Fixtures/DefaultMemoryPoolFixture.cs +++ b/csharp/test/Apache.Arrow.Tests/Fixtures/DefaultMemoryPoolFixture.cs @@ -23,10 +23,9 @@ public class DefaultMemoryPoolFixture public DefaultMemoryPoolFixture() { - const int padding = 8; const int alignment = 64; - MemoryPool = new NativeMemoryPool(padding, alignment); + MemoryPool = new NativeMemoryPool(alignment); } } } diff --git a/dev/README.md b/dev/README.md index 98aeef6d9a4d8..ead36d3747e76 100644 --- a/dev/README.md +++ b/dev/README.md @@ -28,17 +28,22 @@ https://gitbox.apache.org/setup/ to be able to push to GitHub as the main remote. * How to merge a Pull request: -have an apache and apache-github remote setup + ``` -git remote add apache-github https://github.com/apache/arrow.git git remote add apache git@github.com:apache/arrow.git ``` + Run the following command: + ``` dev/merge_arrow_pr.py ``` +This uses the GitHub REST API; if you encounter rate limit issues, you may set +an `ARROW_GITHUB_API_TOKEN` environment variable to use a Personal Access Token.
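(Editorial note: the token handling described above comes down to one conditional `Authorization: token <value>` header on each GitHub API request, as `merge_arrow_pr.py` implements further below. A rough equivalent, sketched in C# to match this section's other examples; the URL and PR number are placeholders, not part of the tooling:

```csharp
using System;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Threading.Tasks;

class GitHubTokenExample
{
    static async Task Main()
    {
        // Without ARROW_GITHUB_API_TOKEN the request goes out anonymously and
        // is subject to GitHub's much lower unauthenticated rate limit.
        string token = Environment.GetEnvironmentVariable("ARROW_GITHUB_API_TOKEN");

        using (var client = new HttpClient())
        {
            // GitHub's REST API rejects requests that lack a User-Agent.
            client.DefaultRequestHeaders.UserAgent.ParseAdd("arrow-pr-merge-example");
            if (!string.IsNullOrEmpty(token))
            {
                client.DefaultRequestHeaders.Authorization =
                    new AuthenticationHeaderValue("token", token);
            }

            // Hypothetical PR number, purely for illustration.
            string json = await client.GetStringAsync(
                "https://api.github.com/repos/apache/arrow/pulls/1234");
            Console.WriteLine(json.Length);
        }
    }
}
```
)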
+ Note: + * The directory name of your Arrow git clone must be called arrow * Without jira-python installed you'll have to close the JIRA manually diff --git a/dev/dask_integration.sh b/dev/dask_integration.sh deleted file mode 100755 index d344328b6af1e..0000000000000 --- a/dev/dask_integration.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Pass the service name to run_docker_compose.sh -# Which validates environment and runs the service -exec "$(dirname ${BASH_SOURCE})"/run_docker_compose.sh dask_integration diff --git a/dev/dask_integration/Dockerfile b/dev/dask_integration/Dockerfile deleted file mode 100644 index f0c1f03f6f93c..0000000000000 --- a/dev/dask_integration/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -FROM arrow_integration_xenial_base - -ADD . /apache-arrow -WORKDIR /apache-arrow - -CMD arrow/dev/dask_integration/dask_integration.sh diff --git a/dev/dask_integration/dask_integration.sh b/dev/dask_integration/dask_integration.sh deleted file mode 100755 index f4999c0ae447f..0000000000000 --- a/dev/dask_integration/dask_integration.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set up environment and working directory -cd /apache-arrow - -conda activate pyarrow-dev - -# install pytables from defaults for now -conda install -y pytables - -pip install -q git+https://github.com/dask/partd --upgrade --no-deps -pip install -q git+https://github.com/dask/zict --upgrade --no-deps -pip install -q git+https://github.com/dask/distributed --upgrade --no-deps -pip install -q git+https://github.com/mrocklin/sparse --upgrade --no-deps -pip install -q git+https://github.com/dask/s3fs --upgrade --no-deps - -conda install -y -q -c conda-forge numba cython \ - bcolz \ - blosc \ - bokeh \ - boto3 \ - chest \ - cloudpickle \ - coverage \ - cytoolz \ - distributed \ - graphviz \ - h5py \ - partd \ - psutil \ - "pytest<=3.1.1" \ - scikit-image \ - scikit-learn \ - sqlalchemy \ - toolz - -pip install -q git+https://github.com/dask/fastparquet - -pip install -q \ - cachey \ - graphviz \ - moto \ - pyarrow \ - --upgrade --no-deps - -pip install -q \ - cityhash \ - flake8 \ - mmh3 \ - pandas_datareader \ - pytest-xdist \ - xxhash \ - pycodestyle - -export ARROW_BUILD_TYPE=release -export ARROW_HOME=$(pwd)/dist -export PARQUET_HOME=$(pwd)/dist -CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} - -# Allow for --user Python installation inside Docker -export HOME=$(pwd) - -# Clean up and get the dask master branch from github -rm -rf dask .local -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -git clone https://github.com/dask/dask.git -pushd dask -pip install --user -e .[complete] -# Verify integrity of the installed dask dataframe code -py.test dask/dataframe/tests/test_dataframe.py -popd - -# Run the integration test -pushd arrow/python/testing -py.test dask_tests -popd - -pushd dask/dask/dataframe/io -py.test tests/test_parquet.py -popd diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index a11b4015c5dd3..19fda7823ac25 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -31,18 +31,6 @@ services: volumes: - ../..:/apache-arrow - spark_integration: - build: - context: spark_integration - volumes: - - ../..:/apache-arrow - - dask_integration: - build: - context: dask_integration - volumes: - - ../..:/apache-arrow - gen_apidocs: build: context: .. @@ -50,12 +38,6 @@ services: volumes: - ..:/arrow - iwyu: - build: - context: iwyu - volumes: - - ../..:/apache-arrow - run_site: build: context: run_site diff --git a/dev/lint/Dockerfile b/dev/lint/Dockerfile index 71d7ec85a8e2e..c7901e1f90e32 100644 --- a/dev/lint/Dockerfile +++ b/dev/lint/Dockerfile @@ -24,7 +24,7 @@ RUN apt-get install -y -q \ clang-tidy \ iwyu -RUN conda install -c conda-forge flake8 && \ +RUN conda install flake8 && \ conda clean --all -y # https://bugs.launchpad.net/ubuntu/+source/iwyu/+bug/1769334 diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 8539d5d3401fd..5a926f5f6d17a 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -24,8 +24,17 @@ # This utility assumes you already have a local Arrow git clone and that you # have added remotes corresponding to both (i) the Github Apache Arrow mirror # and (ii) the apache git repo. 
+# +# There are several pieces of authorization possibly needed via environment +# variables +# +# JIRA_USERNAME: your Apache JIRA id +# JIRA_PASSWORD: your Apache JIRA password +# ARROW_GITHUB_API_TOKEN: a GitHub API token to use for API requests (to avoid +# rate limiting) import os +import pprint import re import subprocess import sys @@ -38,8 +47,8 @@ try: import jira.client except ImportError: - print("Could not find jira-python library. " - "Run 'sudo pip install jira-python' to install.") + print("Could not find jira library. " + "Run 'sudo pip install jira' to install.") print("Exiting without trying to close the associated JIRA.") sys.exit(1) @@ -48,12 +57,10 @@ BRANCH_PREFIX = "PR_TOOL" JIRA_API_BASE = "https://issues.apache.org/jira" - -def get_json(url): - req = requests.get(url) +def get_json(url, headers=None): + req = requests.get(url, headers=headers) return req.json() - def run_cmd(cmd): if isinstance(cmd, six.string_types): cmd = cmd.split(' ') @@ -192,8 +199,15 @@ def __init__(self, project_name): self.github_api = ("https://api.github.com/repos/apache/{0}" .format(project_name)) + token = os.environ.get('ARROW_GITHUB_API_TOKEN', None) + if token: + self.headers = {'Authorization': 'token {0}'.format(token)} + else: + self.headers = None + def get_pr_data(self, number): - return get_json("%s/pulls/%s" % (self.github_api, number)) + return get_json("%s/pulls/%s" % (self.github_api, number), + headers=self.headers) class CommandInput(object): @@ -225,13 +239,16 @@ def __init__(self, cmd, github_api, git_remote, jira_con, number): self.con = jira_con self.number = number self._pr_data = github_api.get_pr_data(number) - self.url = self._pr_data["url"] - self.title = self._pr_data["title"] - - self.body = self._pr_data["body"] - self.target_ref = self._pr_data["base"]["ref"] - self.user_login = self._pr_data["user"]["login"] - self.base_ref = self._pr_data["head"]["ref"] + try: + self.url = self._pr_data["url"] + self.title = self._pr_data["title"] + self.body = self._pr_data["body"] + self.target_ref = self._pr_data["base"]["ref"] + self.user_login = self._pr_data["user"]["login"] + self.base_ref = self._pr_data["head"]["ref"] + except KeyError: + pprint.pprint(self._pr_data) + raise self.description = "%s/%s" % (self.user_login, self.base_ref) self.jira_issue = self._get_jira() @@ -435,4 +452,4 @@ def get_version_json(version_str): try: cli() except Exception as e: - print(e.args[0]) + raise diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 9282cbfd2771d..bfcfc83825499 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -21,10 +21,107 @@ set -e SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +update_versions() { + local base_version=$1 + local next_version=$2 + local type=$3 + + case ${type} in + release) + local version=${base_version} + local r_version=${base_version} + ;; + snapshot) + local version=${next_version}-SNAPSHOT + local r_version=${base_version}.9000 + ;; + esac + + cd "${SOURCE_DIR}/../../cpp" + sed -i.bak -E -e \ + "s/^set\(ARROW_VERSION \".+\"\)/set(ARROW_VERSION \"${version}\")/" \ + CMakeLists.txt + rm -f CMakeLists.txt.bak + git add CMakeLists.txt + cd - + + cd "${SOURCE_DIR}/../../c_glib" + sed -i.bak -E -e \ + "s/^m4_define\(\[arrow_glib_version\], .+\)/m4_define([arrow_glib_version], ${version})/" \ + configure.ac + sed -i.bak -E -e \ + "s/^version = '.+'/version = '${version}'/" \ + meson.build + rm -f configure.ac.bak meson.build.bak + git add configure.ac meson.build + cd - + + # We can 
enable this when Arrow JS uses the same version. + # cd "${SOURCE_DIR}/../../js" + # sed -i.bak -E -e \ + # "s/^ \"version\": \".+\"/ \"version\": \"${version}\"/" \ + # package.json + # rm -f package.json + # git add package.json + # cd - + + cd "${SOURCE_DIR}/../../matlab" + sed -i.bak -E -e \ + "s/^set\(MLARROW_VERSION \".+\"\)/set(MLARROW_VERSION \"${version}\")/" \ + CMakeLists.txt + rm -f CMakeLists.txt.bak + git add CMakeLists.txt + cd - + + cd "${SOURCE_DIR}/../../python" + sed -i.bak -E -e \ + "s/^default_version = '.+'/default_version = '${version}'/" \ + setup.py + rm -f setup.py.bak + git add setup.py + cd - + + cd "${SOURCE_DIR}/../../r" + sed -i.bak -E -e \ + "s/^Version: .+/Version: ${r_version}/" \ + DESCRIPTION + rm -f DESCRIPTION.bak + git add DESCRIPTION + cd - + + cd "${SOURCE_DIR}/../../ruby" + sed -i.bak -E -e \ + "s/^ VERSION = \".+\"/ VERSION = \"${version}\"/g" \ + */*/*/version.rb + rm -f */*/*/version.rb.bak + git add */*/*/version.rb + cd - + + cd "${SOURCE_DIR}/../../rust" + sed -i.bak -E -e \ + "s/^version = \".+\"/version = \"${version}\"/g" \ + arrow/Cargo.toml parquet/Cargo.toml + rm -f arrow/Cargo.toml.bak parquet/Cargo.toml.bak + git add arrow/Cargo.toml parquet/Cargo.toml + + # Update version number for parquet README + sed -i.bak -E -e \ + "s/^parquet = \".+\"/parquet = \"${version}\"/g" \ + parquet/README.md + sed -i.bak -E -e \ + "s/docs.rs\/crate\/parquet\/.+\)/docs.rs\/crate\/parquet\/${version}\)/g" \ + parquet/README.md + rm -f parquet/README.md.bak + git add parquet/README.md + cd - +} + if [ "$#" -eq 2 ]; then + ############################## Pre-Tag Commits ############################## + version=$1 - nextVersion=$2 - nextVersionSNAPSHOT=${nextVersion}-SNAPSHOT + next_version=$2 + next_version_snapshot=${next_version}-SNAPSHOT tag=apache-arrow-${version} echo "Updating changelog for $version" @@ -41,44 +138,51 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb/.rpm changelogs for $version" cd - - echo "prepare release ${version} on tag ${tag} then reset to version ${nextVersionSNAPSHOT}" + echo "prepare release ${version} on tag ${tag} then reset to version ${next_version_snapshot}" - cd "${SOURCE_DIR}/../../java" + update_versions "${version}" "${next_version}" "release" + git commit -m "[Release] Update versions for ${version}" + cd "${SOURCE_DIR}/../../java" mvn release:clean - mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${nextVersionSNAPSHOT} - + mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${next_version_snapshot} cd - - echo "Updating .deb package names for $nextVersion" - deb_lib_suffix=$(echo $version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') - next_deb_lib_suffix=$(echo $nextVersion | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') + ############################## Post-Tag Commits ############################# + + echo "Updating versions for ${next_version_snapshot}" + update_versions "${version}" "${next_version}" "snapshot" + git commit -m "[Release] Update versions for ${next_version_snapshot}" + + echo "Updating .deb package names for ${next_version}" + deb_lib_suffix=$(echo $version | sed -E -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') + next_deb_lib_suffix=$(echo $next_version | sed -E -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') cd $SOURCE_DIR/../tasks/linux-packages/ for target in debian*/lib*${deb_lib_suffix}.install; do git mv \ ${target} \ $(echo $target | sed -e 
"s/${deb_lib_suffix}/${next_deb_lib_suffix}/") done - deb_lib_suffix_substitute_pattern="s/(lib(arrow|parquet)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" - sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" debian*/control + deb_lib_suffix_substitute_pattern="s/(lib(arrow|gandiva|parquet|plasma)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" + sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" debian*/control rm -f debian*/control.bak git add debian*/control cd - cd $SOURCE_DIR/../tasks/ - sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" tasks.yml + sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" tasks.yml rm -f tasks.yml.bak git add tasks.yml cd - cd $SOURCE_DIR - sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt + sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt rm -f rat_exclude_files.txt.bak git add rat_exclude_files.txt - git commit -m "[Release] Update .deb package names for $nextVersion" + git commit -m "[Release] Update .deb package names for $next_version" cd - echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else - echo "Usage: $0 " + echo "Usage: $0 " exit fi diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index e224584223b4c..85dee3302e917 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -45,46 +45,28 @@ echo "Using commit $release_hash" tarball=${tag}.tar.gz -extract_dir=tmp-apache-arrow -rm -rf ${extract_dir} +archive_name=tmp-apache-arrow # be conservative and use the release hash, even though git produces the same # archive (identical hashes) using the scm tag -git archive ${release_hash} --prefix ${extract_dir}/ | tar xf - - -# build Apache Arrow C++ before building Apache Arrow GLib because -# Apache Arrow GLib requires Apache Arrow C++. -mkdir -p ${extract_dir}/cpp/build -cpp_install_dir=${PWD}/${extract_dir}/cpp/install -cd ${extract_dir}/cpp/build -cmake .. \ - -DCMAKE_INSTALL_PREFIX=${cpp_install_dir} \ - -DCMAKE_INSTALL_LIBDIR=${cpp_install_dir}/lib \ - -DARROW_BUILD_TESTS=no \ - -DARROW_PARQUET=yes -make -j8 -make install -cd - - -# build source archive for Apache Arrow GLib by "make dist". 
-cd ${extract_dir}/c_glib -./autogen.sh -./configure \ - PKG_CONFIG_PATH=$cpp_install_dir/lib/pkgconfig \ - --enable-gtk-doc -LD_LIBRARY_PATH=$cpp_install_dir/lib:$LD_LIBRARY_PATH make -j8 -make dist -tar xzf *.tar.gz -rm *.tar.gz -cd - -rm -rf tmp-c_glib/ -mv ${extract_dir}/c_glib/apache-arrow-glib-* tmp-c_glib/ -rm -rf ${extract_dir} +git archive ${release_hash} --prefix ${archive_name}/ > ${archive_name}.tar.gz + +dist_c_glib_tar_gz=c_glib.tar.gz +docker_image_name=apache-arrow/release-source +DEBUG=yes docker build -t ${docker_image_name} ${SOURCE_DIR}/source +docker \ + run \ + --rm \ + --interactive \ + --volume "$PWD":/host \ + ${docker_image_name} \ + /build.sh ${archive_name} ${dist_c_glib_tar_gz} # replace c_glib/ by tar.gz generated by "make dist" rm -rf ${tag} git archive $release_hash --prefix ${tag}/ | tar xf - rm -rf ${tag}/c_glib -mv tmp-c_glib ${tag}/c_glib +tar xf ${dist_c_glib_tar_gz} -C ${tag} +rm -f ${dist_c_glib_tar_gz} # Create new tarball from modified source directory tar czhf ${tarball} ${tag} diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 0baf29edd83e4..4866ec2aa3c30 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -13,18 +13,10 @@ cpp/src/arrow/io/mman.h cpp/src/arrow/util/random.h cpp/src/arrow/status.cc cpp/src/arrow/status.h -cpp/src/arrow/util/string_view/string_view.hpp -cpp/src/arrow/util/variant.h -cpp/src/arrow/util/variant/optional.h -cpp/src/arrow/util/variant/recursive_wrapper.h -cpp/src/arrow/util/variant/variant_cast.h -cpp/src/arrow/util/variant/variant_io.h -cpp/src/arrow/util/variant/variant_visitor.h -cpp/src/arrow/util/xxhash/xxhash.c -cpp/src/arrow/util/xxhash/xxhash.h +cpp/src/arrow/vendored/* cpp/build-support/asan_symbolize.py cpp/build-support/cpplint.py -cpp/build-support/clang_format_exclusions.txt +cpp/build-support/lint_exclusions.txt cpp/build-support/iwyu/* cpp/cmake_modules/BuildUtils.cmake cpp/cmake_modules/FindPythonLibsNew.cmake @@ -54,23 +46,23 @@ dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib-dev.install dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib-doc.doc-base dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib-doc.install dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib-doc.links -dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib12.install -dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-python12.install -dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow12.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib13.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-python13.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow13.install dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-dev.install dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib-dev.install dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib-doc.doc-base dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib-doc.install dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib-doc.links -dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib12.install -dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet12.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib13.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet13.install dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-dev.install dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-dev.install 
dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.doc-base dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.install dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.links -dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install -dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib13.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma13.install dev/tasks/linux-packages/debian.ubuntu-trusty/patches/series dev/tasks/linux-packages/debian.ubuntu-trusty/plasma-store-server.install dev/tasks/linux-packages/debian.ubuntu-trusty/rules @@ -80,6 +72,7 @@ dev/tasks/linux-packages/debian/compat dev/tasks/linux-packages/debian/control dev/tasks/linux-packages/debian/gir1.2-arrow-1.0.install dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install +dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install dev/tasks/linux-packages/debian/gir1.2-parquet-1.0.install dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install dev/tasks/linux-packages/debian/libarrow-dev.install @@ -87,33 +80,41 @@ dev/tasks/linux-packages/debian/libarrow-glib-dev.install dev/tasks/linux-packages/debian/libarrow-glib-doc.doc-base dev/tasks/linux-packages/debian/libarrow-glib-doc.install dev/tasks/linux-packages/debian/libarrow-glib-doc.links -dev/tasks/linux-packages/debian/libarrow-glib12.install +dev/tasks/linux-packages/debian/libarrow-glib13.install dev/tasks/linux-packages/debian/libarrow-cuda-dev.install dev/tasks/linux-packages/debian/libarrow-cuda-glib-dev.install -dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install -dev/tasks/linux-packages/debian/libarrow-cuda12.install +dev/tasks/linux-packages/debian/libarrow-cuda-glib13.install +dev/tasks/linux-packages/debian/libarrow-cuda13.install dev/tasks/linux-packages/debian/libarrow-python-dev.install -dev/tasks/linux-packages/debian/libarrow-python12.install -dev/tasks/linux-packages/debian/libarrow12.install +dev/tasks/linux-packages/debian/libarrow-python13.install +dev/tasks/linux-packages/debian/libarrow13.install +dev/tasks/linux-packages/debian/libgandiva-dev.install +dev/tasks/linux-packages/debian/libgandiva-glib-dev.install +dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base +dev/tasks/linux-packages/debian/libgandiva-glib-doc.install +dev/tasks/linux-packages/debian/libgandiva-glib-doc.links +dev/tasks/linux-packages/debian/libgandiva-glib13.install +dev/tasks/linux-packages/debian/libgandiva13.install dev/tasks/linux-packages/debian/libparquet-dev.install dev/tasks/linux-packages/debian/libparquet-glib-dev.install dev/tasks/linux-packages/debian/libparquet-glib-doc.doc-base dev/tasks/linux-packages/debian/libparquet-glib-doc.install dev/tasks/linux-packages/debian/libparquet-glib-doc.links -dev/tasks/linux-packages/debian/libparquet-glib12.install -dev/tasks/linux-packages/debian/libparquet12.install +dev/tasks/linux-packages/debian/libparquet-glib13.install +dev/tasks/linux-packages/debian/libparquet13.install dev/tasks/linux-packages/debian/libplasma-dev.install dev/tasks/linux-packages/debian/libplasma-glib-dev.install dev/tasks/linux-packages/debian/libplasma-glib-doc.doc-base dev/tasks/linux-packages/debian/libplasma-glib-doc.install dev/tasks/linux-packages/debian/libplasma-glib-doc.links -dev/tasks/linux-packages/debian/libplasma-glib12.install -dev/tasks/linux-packages/debian/libplasma12.install +dev/tasks/linux-packages/debian/libplasma-glib13.install 
+dev/tasks/linux-packages/debian/libplasma13.install dev/tasks/linux-packages/debian/patches/series dev/tasks/linux-packages/debian/plasma-store-server.install dev/tasks/linux-packages/debian/rules dev/tasks/linux-packages/debian/source/format dev/tasks/linux-packages/debian/watch +dev/tasks/conda-recipes/variants/*.yaml docs/requirements.txt go/arrow/go.sum go/arrow/Gopkg.lock @@ -129,6 +130,8 @@ python/MANIFEST.in python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py python/requirements.txt +python/requirements-test.txt +python/requirements-wheel.txt pax_global_header MANIFEST.in __init__.pxd @@ -184,4 +187,6 @@ r/README.md r/README.Rmd r/man/*.Rd .gitattributes -rust/test/data/*.csv +ruby/red-arrow/.yardopts +rust/arrow/test/data/*.csv +rust/rust-toolchain diff --git a/dev/release/source/Dockerfile b/dev/release/source/Dockerfile new file mode 100644 index 0000000000000..70ed8aa866dd0 --- /dev/null +++ b/dev/release/source/Dockerfile @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM ubuntu:18.04 + +ENV DEBIAN_FRONTEND noninteractive + +RUN \ + apt update && \ + apt install -y -V \ + autoconf-archive \ + bison \ + clang-6.0 \ + cmake \ + flex \ + g++ \ + gcc \ + gtk-doc-tools \ + libboost-filesystem-dev \ + libboost-regex-dev \ + libboost-system-dev \ + libgirepository1.0-dev \ + libglib2.0-doc \ + libprotobuf-dev \ + libprotoc-dev \ + libtool \ + lsb-release \ + make \ + pkg-config \ + protobuf-compiler && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +COPY build.sh /build.sh diff --git a/dev/release/source/build.sh b/dev/release/source/build.sh new file mode 100755 index 0000000000000..25775fdc3e813 --- /dev/null +++ b/dev/release/source/build.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +archive_name=$1 +dist_c_glib_tar_gz=$2 + +tar xf /host/${archive_name}.tar.gz + +# build Apache Arrow C++ before building Apache Arrow GLib because +# Apache Arrow GLib requires Apache Arrow C++. 
+mkdir -p ${archive_name}/cpp/build +cpp_install_dir=${PWD}/${archive_name}/cpp/install +cd ${archive_name}/cpp/build +cmake .. \ + -DCMAKE_INSTALL_PREFIX=${cpp_install_dir} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_PLASMA=yes \ + -DARROW_GANDIVA=yes \ + -DARROW_PARQUET=yes +make -j8 +make install +cd - + +# build source archive for Apache Arrow GLib by "make dist". +cd ${archive_name}/c_glib +./autogen.sh +./configure \ + PKG_CONFIG_PATH=${cpp_install_dir}/lib/pkgconfig \ + --enable-gtk-doc +LD_LIBRARY_PATH=${cpp_install_dir}/lib make -j8 +make dist +tar xzf *.tar.gz +rm *.tar.gz +cd - +mv ${archive_name}/c_glib/apache-arrow-glib-* c_glib/ +tar czf /host/${dist_c_glib_tar_gz} c_glib diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index cc25b045dce47..f5f9e964231b6 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -46,12 +46,11 @@ call conda create -p %_VERIFICATION_CONDA_ENV% -f -q -y python=%PYTHON% || exit call activate %_VERIFICATION_CONDA_ENV% || exit /B call conda install -y ^ - six pytest setuptools numpy pandas cython ^ - thrift-cpp flatbuffers rapidjson ^ - cmake ^ - git ^ - boost-cpp ^ - snappy zlib brotli gflags lz4-c zstd -c conda-forge || exit /B + python=3.7 ^ + git ^ + --file=ci\conda_env_cpp.yml ^ + --file=ci\conda_env_python.yml ^ + -c conda-forge || exit /B set GENERATOR=Visual Studio 14 2015 Win64 set CONFIGURATION=release @@ -74,6 +73,7 @@ call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tool cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ -DARROW_BOOST_USE_SHARED=ON ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ -DARROW_PYTHON=ON ^ diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 5b666630d17a0..3694c867dcb20 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -51,10 +51,10 @@ HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd) ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' -: ${ARROW_HAVE_GPU:=} -if [ -z "$ARROW_HAVE_GPU" ]; then +: ${ARROW_HAVE_CUDA:=} +if [ -z "$ARROW_HAVE_CUDA" ]; then if nvidia-smi --list-gpus 2>&1 > /dev/null; then - ARROW_HAVE_GPU=yes + ARROW_HAVE_CUDA=yes fi fi @@ -87,24 +87,51 @@ fetch_archive() { shasum -a 512 -c ${dist_name}.tar.gz.sha512 } +bintray() { + local command=$1 + shift + local path=$1 + shift + local url=https://bintray.com/api/v1${path} + echo "${command} ${url}" 1>&2 + curl \ + --fail \ + --request ${command} \ + ${url} \ + "$@" | \ + jq . 
+} + +download_bintray_files() { + local target=$1 + + local version_name=${VERSION}-rc${RC_NUMBER} + + bintray \ + GET /packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files | \ + jq -r ".[].path" | \ + while read file; do + mkdir -p "$(dirname ${file})" + curl \ + --fail \ + --location \ + --output ${file} \ + https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file} + done +} + verify_binary_artifacts() { - # --show-progress not supported on wget < 1.16 - wget --help | grep -q '\--show-progress' && \ - _WGET_PROGRESS_OPT="-q --show-progress" || _WGET_PROGRESS_OPT="" - - # download the binaries folder for the current RC - rcname=apache-arrow-${VERSION}-rc${RC_NUMBER} - wget -P "$rcname" \ - --quiet \ - --no-host-directories \ - --cut-dirs=5 \ - $_WGET_PROGRESS_OPT \ - --no-parent \ - --reject 'index.html*' \ - --recursive "$ARROW_DIST_URL/$rcname/binaries/" + local download_dir=binaries + mkdir -p ${download_dir} + pushd ${download_dir} + + # takes longer on slow network + for target in centos debian python ubuntu; do + download_bintray_files ${target} + done # verify the signature and the checksums of each artifact - find $rcname/binaries -name '*.asc' | while read sigfile; do + find . -name '*.asc' | while read sigfile; do artifact=${sigfile/.asc/} gpg --verify $sigfile $artifact || exit 1 @@ -112,10 +139,14 @@ verify_binary_artifacts() { # basename of the artifact pushd $(dirname $artifact) base_artifact=$(basename $artifact) - shasum -a 256 -c $base_artifact.sha256 || exit 1 + if [ -f $base_artifact.sha256 ]; then + shasum -a 256 -c $base_artifact.sha256 || exit 1 + fi shasum -a 512 -c $base_artifact.sha512 || exit 1 popd done + + popd } setup_tempdir() { @@ -143,12 +174,13 @@ setup_miniconda() { . $MINICONDA/etc/profile.d/conda.sh - conda create -n arrow-test -y -q python=3.6 \ + conda create -n arrow-test -y -q -c conda-forge \ + python=3.6 \ nomkl \ numpy \ pandas \ six \ - cython -c conda-forge + cython conda activate arrow-test } @@ -159,18 +191,21 @@ test_and_install_cpp() { pushd cpp/build ARROW_CMAKE_OPTIONS=" +${ARROW_CMAKE_OPTIONS} -DCMAKE_INSTALL_PREFIX=$ARROW_HOME --DCMAKE_INSTALL_LIBDIR=$ARROW_HOME/lib +-DCMAKE_INSTALL_LIBDIR=lib -DARROW_PLASMA=ON -DARROW_ORC=ON -DARROW_PYTHON=ON +-DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_BOOST_USE_SHARED=ON -DCMAKE_BUILD_TYPE=release +-DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON " - if [ "$ARROW_HAVE_GPU" = "yes" ]; then - ARROW_CMAKE_OPTIONS="$ARROW_CMAKE_OPTIONS -DARROW_GPU=ON" + if [ "$ARROW_HAVE_CUDA" = "yes" ]; then + ARROW_CMAKE_OPTIONS="$ARROW_CMAKE_OPTIONS -DARROW_CUDA=ON" fi cmake $ARROW_CMAKE_OPTIONS .. @@ -189,7 +224,7 @@ test_and_install_cpp() { test_python() { pushd python - pip install -r requirements.txt + pip install -r requirements.txt -r requirements-test.txt python setup.py build_ext --inplace --with-parquet --with-plasma py.test pyarrow -v --pdb @@ -211,8 +246,6 @@ test_glib() { gem install bundler fi - # Workaround for 0.11.0. 0.11.0 doesn't include c_glib/Gemfile. 
- wget https://raw.githubusercontent.com/apache/arrow/master/c_glib/Gemfile bundle install --path vendor/bundle bundle exec ruby test/run-test.rb @@ -240,17 +273,17 @@ test_js() { test_ruby() { pushd ruby - pushd red-arrow - bundle install --path vendor/bundle - bundle exec ruby test/run-test.rb - popd + local modules="red-arrow red-plasma red-gandiva red-parquet" + if [ "${ARROW_HAVE_CUDA}" = "yes" ]; then + modules="${modules} red-arrow-cuda" + fi - if [ "$ARROW_HAVE_GPU" = "yes" ]; then - pushd red-arrow-gpu + for module in ${modules}; do + pushd ${module} bundle install --path vendor/bundle bundle exec ruby test/run-test.rb popd - fi + done popd } @@ -276,9 +309,7 @@ test_rust() { cargo fmt --all -- --check # raises on any warnings - cargo rustc -- -D warnings - - cargo build + RUSTFLAGS="-D warnings" cargo build cargo test popd @@ -331,21 +362,58 @@ if [ "$ARTIFACT" == "source" ]; then TARBALL=apache-arrow-$1.tar.gz DIST_NAME="apache-arrow-${VERSION}" + # By default test all functionalities. + # To deactivate one test, deactivate the test and all of its dependents + # To explicitly select one test, set TEST_DEFAULT=0 TEST_X=1 + : ${TEST_DEFAULT:=1} + : ${TEST_JAVA:=${TEST_DEFAULT}} + : ${TEST_CPP:=${TEST_DEFAULT}} + : ${TEST_GLIB:=${TEST_DEFAULT}} + : ${TEST_RUBY:=${TEST_DEFAULT}} + : ${TEST_PYTHON:=${TEST_DEFAULT}} + : ${TEST_JS:=${TEST_DEFAULT}} + : ${TEST_INTEGRATION:=${TEST_DEFAULT}} + : ${TEST_RUST:=${TEST_DEFAULT}} + + # Automatically test if its activated by a dependent + TEST_GLIB=$((${TEST_GLIB} + ${TEST_RUBY})) + TEST_PYTHON=$((${TEST_PYTHON} + ${TEST_INTEGRATION})) + TEST_CPP=$((${TEST_CPP} + ${TEST_GLIB} + ${TEST_PYTHON})) + TEST_JAVA=$((${TEST_JAVA} + ${TEST_INTEGRATION})) + TEST_JS=$((${TEST_JS} + ${TEST_INTEGRATION})) + fetch_archive $DIST_NAME tar xvzf ${DIST_NAME}.tar.gz cd ${DIST_NAME} - test_package_java - setup_miniconda - test_and_install_cpp - test_python - test_glib - test_ruby - test_js - test_integration - test_rust + if [ ${TEST_JAVA} -gt 0 ]; then + test_package_java + fi + if [ ${TEST_CPP} -gt 0 ]; then + setup_miniconda + test_and_install_cpp + fi + if [ ${TEST_PYTHON} -gt 0 ]; then + test_python + fi + if [ ${TEST_GLIB} -gt 0 ]; then + test_glib + fi + if [ ${TEST_RUBY} -gt 0 ]; then + test_ruby + fi + if [ ${TEST_JS} -gt 0 ]; then + test_js + fi + if [ ${TEST_INTEGRATION} -gt 0 ]; then + test_integration + fi + if [ ${TEST_RUST} -gt 0 ]; then + test_rust + fi else - # takes longer on slow network + : ${BINTRAY_REPOSITORY:=apache/arrow} + verify_binary_artifacts fi diff --git a/dev/spark_integration/Dockerfile b/dev/spark_integration/Dockerfile deleted file mode 100644 index 84e353a9b5096..0000000000000 --- a/dev/spark_integration/Dockerfile +++ /dev/null @@ -1,67 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -FROM maven:3.5.2-jdk-8-slim - -# Basic OS utilities -RUN apt-get update \ - && apt-get install -y \ - wget \ - git \ - pkg-config \ - build-essential \ - software-properties-common \ - && apt-get clean - -# install conda in /home/ubuntu/miniconda -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O conda.sh \ - && /bin/bash conda.sh -b -p /opt/conda \ - && rm conda.sh - -ENV PATH="/opt/conda/bin:${PATH}" - -RUN conda create -y -q -c conda-forge -n pyarrow-dev \ - python=2.7 \ - ipython \ - nomkl \ - numpy \ - six \ - setuptools \ - cython \ - pandas \ - pytest \ - cmake \ - flatbuffers \ - rapidjson \ - boost-cpp \ - thrift-cpp \ - snappy \ - zlib \ - gflags \ - brotli \ - lz4-c \ - zstd \ - setuptools \ - setuptools_scm \ - && conda clean --all - -ADD . /apache-arrow -WORKDIR /apache-arrow - -CMD arrow/dev/spark_integration/spark_integration.sh - -# BUILD: $ docker build -f arrow/dev/spark_integration/Dockerfile -t spark-arrow . -# RUN: $ docker run -v $HOME/.m2:/root/.m2 spark-arrow diff --git a/dev/spark_integration/spark_integration.sh b/dev/spark_integration/spark_integration.sh deleted file mode 100755 index 1f6a2733385f4..0000000000000 --- a/dev/spark_integration/spark_integration.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Exit on any error -set -e - -# Set up environment and working directory -cd /apache-arrow - -# Activate our pyarrow-dev conda env -conda activate pyarrow-dev - -export ARROW_HOME=$(pwd)/arrow -export ARROW_BUILD_TYPE=release -export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" - -# Build Arrow C++ -pushd arrow/cpp -rm -rf build/* -mkdir -p build -cd build/ -cmake -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -DARROW_PYTHON=on -DARROW_HDFS=on -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. -make -j4 -make install -popd - -# Build pyarrow and install inplace -export PYARROW_CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -pushd arrow/python -python setup.py clean -python setup.py build_ext --build-type=$ARROW_BUILD_TYPE install -popd - -# Install Arrow to local maven repo and get the version -pushd arrow/java -echo "Building and installing Arrow Java" -mvn -DskipTests -Drat.skip=true clean install -ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'` -echo "Using Arrow version $ARROW_VERSION" -popd - -# Build Spark with Arrow -SPARK_REPO=git://git.apache.org/spark.git -SPARK_BRANCH=master - -# Get the Spark repo if not in image already -if [ ! 
-d "$(pwd)/spark" ]; then - export GIT_COMMITTER_NAME="Nobody" - export GIT_COMMITTER_EMAIL="nobody@nowhere.com" - git clone "$SPARK_REPO" -fi - -pushd spark - -# Make sure branch has no modifications -git checkout "$SPARK_BRANCH" -git reset --hard HEAD - -# Update Spark pom with the Arrow version just installed and build Spark, need package phase for pyspark -sed -i -e "s/\(.*\).*\(<\/arrow.version>\)/\1$ARROW_VERSION\2/g" ./pom.xml -echo "Building Spark with Arrow $ARROW_VERSION" -build/mvn -DskipTests clean package - -# Run Arrow related Scala tests only, NOTE: -Dtest=_NonExist_ is to enable surefire test discovery without running any tests so that Scalatest can run -SPARK_SCALA_TESTS="org.apache.spark.sql.execution.arrow,org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite,org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite" -echo "Testing Spark: $SPARK_SCALA_TESTS" -# TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working -build/mvn -Dtest=none -DwildcardSuites="$SPARK_SCALA_TESTS" test - -# Run pyarrow related Python tests only -SPARK_PYTHON_TESTS="ArrowTests PandasUDFTests ScalarPandasUDFTests GroupedMapPandasUDFTests GroupedAggPandasUDFTests WindowPandasUDFTests" -echo "Testing PySpark: $SPARK_PYTHON_TESTS" -SPARK_TESTING=1 bin/pyspark pyspark.sql.tests $SPARK_PYTHON_TESTS -popd diff --git a/dev/tasks/conda-recipes/appveyor.yml b/dev/tasks/conda-recipes/appveyor.yml index cdc9d97537156..8558aa2662a36 100644 --- a/dev/tasks/conda-recipes/appveyor.yml +++ b/dev/tasks/conda-recipes/appveyor.yml @@ -16,7 +16,7 @@ # under the License. environment: - ARROW_VERSION: {{ arrow.version }} + ARROW_VERSION: {{ arrow.no_rc_version }} # regardless of the python version we build against CONDA_INSTALL_LOCN: C:\Miniconda36-x64 @@ -35,25 +35,26 @@ install: - cmd: set PYTHONUNBUFFERED=1 - # Add our channels. - - cmd: conda.exe config --set show_channel_urls true - - cmd: conda.exe config --remove channels defaults - - cmd: conda.exe config --add channels defaults - - cmd: conda.exe config --add channels conda-forge - # Configure the VM. - - cmd: conda.exe install -n root --quiet --yes conda-forge-ci-setup=1 - - cmd: run_conda_forge_build_setup + - cmd: conda.exe install -n root -c conda-forge --quiet --yes conda-forge-ci-setup=2 conda-build # Skip .NET project specific build phase. build: off test_script: + # Clone arrow - git clone -b {{ arrow.branch }} {{ arrow.remote }} arrow || exit /B - git -C arrow checkout {{ arrow.head }} || exit /B - - pushd arrow\dev\tasks\conda-recipes - - conda.exe build --output-folder . -m {{ variant_config_file }} parquet-cpp arrow-cpp pyarrow + + # Configure conda + - cmd: setup_conda_rc .\ .\ variants\{{ config }}.yaml + - cmd: run_conda_forge_build_setup + + # Build the recipes + - conda.exe build --output-folder . -m variants\{{ config }}.yaml parquet-cpp arrow-cpp pyarrow + + # Rename artifacts - pushd win-64 - for %%f in (*.tar.bz2) do ( set %%g=%%~nf diff --git a/dev/tasks/conda-recipes/arrow-cpp/build.sh b/dev/tasks/conda-recipes/arrow-cpp/build.sh index 3ae3bed389234..3d7dbb74595de 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/build.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build.sh @@ -40,7 +40,9 @@ cmake \ -DARROW_PYTHON=ON \ -DARROW_PARQUET=ON \ -DARROW_ORC=ON \ + -DCMAKE_AR=${AR} \ + -DCMAKE_RANLIB=${RANLIB} \ + -GNinja \ .. 
-make -j${CPU_COUNT} -make install +ninja install diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 725fd2291e75a..92c2e6b7eeee3 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -33,6 +33,7 @@ requirements: build: - cmake - autoconf # [unix] + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} host: @@ -44,6 +45,7 @@ requirements: - rapidjson - zlib - glog + - gflags - snappy - brotli - zstd diff --git a/dev/tasks/conda-recipes/pyarrow/meta.yaml b/dev/tasks/conda-recipes/pyarrow/meta.yaml index 167056ba68e9c..9f6ae79dc64d7 100644 --- a/dev/tasks/conda-recipes/pyarrow/meta.yaml +++ b/dev/tasks/conda-recipes/pyarrow/meta.yaml @@ -33,6 +33,9 @@ requirements: - {{ compiler('c') }} - {{ compiler('cxx') }} host: + # directly pin boost-cpp as we also seem to directly include boost symbols + # in the Python modules. + - boost-cpp - python - setuptools - setuptools_scm @@ -42,6 +45,7 @@ requirements: - arrow-cpp {{ ARROW_VERSION }} run: + - boost-cpp - python - setuptools - {{ pin_compatible('numpy', lower_bound='1.14') }} @@ -58,6 +62,7 @@ test: requires: - pytest + - hypothesis commands: - pytest --pyargs pyarrow diff --git a/dev/tasks/conda-recipes/travis.linux.yml b/dev/tasks/conda-recipes/travis.linux.yml index c0fc71d230a55..d07553584cb8c 100644 --- a/dev/tasks/conda-recipes/travis.linux.yml +++ b/dev/tasks/conda-recipes/travis.linux.yml @@ -25,7 +25,7 @@ if: tag IS blank env: global: - TRAVIS_TAG={{ task.tag }} - - ARROW_VERSION={{ arrow.version }} + - ARROW_VERSION={{ arrow.no_rc_version }} - PYTHONUNBUFFERED=1 install: @@ -39,22 +39,28 @@ install: curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}" bash $MINICONDA_FILE -b - # Configure conda. + # Install conda build dependency - | echo "" echo "Configuring conda." source /home/travis/miniconda3/bin/activate root - conda config --remove channels defaults - conda config --add channels defaults - conda config --add channels conda-forge - conda config --set show_channel_urls true - conda install --yes --quiet conda-build + conda install -n root -c conda-forge --quiet --yes conda-forge-ci-setup=2 conda-build -script: +before_script: - git clone -b {{ arrow.branch }} {{ arrow.remote }} arrow - git -C arrow checkout {{ arrow.head }} - pushd arrow/dev/tasks/conda-recipes - - conda build --output-folder . -m {{ variant_config_file }} parquet-cpp arrow-cpp pyarrow + + # Configure conda + - setup_conda_rc ./ ./ variants/{{ config }}.yaml + - source run_conda_forge_build_setup + +script: + - | + conda build --croot $TRAVIS_HOME/conda_build_root \ + --output-folder . \ + -m variants/{{ config }}.yaml \ + parquet-cpp arrow-cpp pyarrow deploy: provider: releases diff --git a/dev/tasks/conda-recipes/travis.osx.yml b/dev/tasks/conda-recipes/travis.osx.yml index 193539d8c9f37..99a79b84bcb66 100644 --- a/dev/tasks/conda-recipes/travis.osx.yml +++ b/dev/tasks/conda-recipes/travis.osx.yml @@ -16,7 +16,7 @@ # under the License. os: osx -osx_image: xcode6.4 +osx_image: xcode9.4 language: generic # don't build twice @@ -25,19 +25,9 @@ if: tag IS blank env: global: - TRAVIS_TAG={{ task.tag }} - - ARROW_VERSION={{ arrow.version }} + - ARROW_VERSION={{ arrow.no_rc_version }} - PYTHONUNBUFFERED=1 -before_install: - # Remove homebrew. - - | - echo "" - echo "Removing homebrew from Travis CI to avoid conflicts." 
- curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/uninstall > ~/uninstall_homebrew - chmod +x ~/uninstall_homebrew - ~/uninstall_homebrew -fq - rm ~/uninstall_homebrew - install: # Install Miniconda. - | @@ -47,24 +37,35 @@ install: MINICONDA_FILE="Miniconda3-latest-MacOSX-x86_64.sh" curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}" bash $MINICONDA_FILE -b - - # Configure conda. + # Install conda build dependency - | echo "" echo "Configuring conda." source /Users/travis/miniconda3/bin/activate root - conda config --remove channels defaults - conda config --add channels defaults - conda config --add channels conda-forge - conda config --set show_channel_urls true - conda install --yes --quiet conda-forge-ci-setup=1 - source run_conda_forge_build_setup + conda install -n root -c conda-forge --quiet --yes conda-forge-ci-setup=2 conda-build -script: + +before_script: - git clone -b {{ arrow.branch }} {{ arrow.remote }} arrow - git -C arrow checkout {{ arrow.head }} - pushd arrow/dev/tasks/conda-recipes - - conda build --output-folder . -m {{ variant_config_file }} parquet-cpp arrow-cpp pyarrow + + # Configure conda + - setup_conda_rc ./ ./ variants/{{ config }}.yaml + + # XXX: workaround, see run_conda_forge_build_setup_osx#L33 + - mkdir -p ./.ci_support + - cp variants/{{ config }}.yaml ./.ci_support/{{ config }}.yaml + - CONFIG={{ config }} source run_conda_forge_build_setup + + # Compiler cleanup + - mangle_compiler ./ ./ ./.ci_support/{{ config }}.yaml + +script: + - | + conda build --output-folder . \ + -m ./.ci_support/{{ config }}.yaml \ + parquet-cpp arrow-cpp pyarrow deploy: provider: releases diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml new file mode 100644 index 0000000000000..149d5fdffcff5 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml @@ -0,0 +1,49 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +c_compiler: +- gcc +cxx_compiler: +- gxx +docker_image: +- condaforge/linux-anvil-comp7 +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '2.7' +snappy: +- 1.1.7 +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml new file mode 100644 index 0000000000000..b71d9de27be0a --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml @@ -0,0 +1,49 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +c_compiler: +- gcc +cxx_compiler: +- gxx +docker_image: +- condaforge/linux-anvil-comp7 +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '3.6' +snappy: +- 1.1.7 
+zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml new file mode 100644 index 0000000000000..e5dbba5b34b1d --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml @@ -0,0 +1,49 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +c_compiler: +- gcc +cxx_compiler: +- gxx +docker_image: +- condaforge/linux-anvil-comp7 +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '3.7' +snappy: +- 1.1.7 +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_python2.7.yaml b/dev/tasks/conda-recipes/variants/linux_python2.7.yaml deleted file mode 100644 index 45026b07d60ab..0000000000000 --- a/dev/tasks/conda-recipes/variants/linux_python2.7.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '2.7' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_python3.5.yaml b/dev/tasks/conda-recipes/variants/linux_python3.5.yaml deleted file mode 100644 index 683022f834913..0000000000000 --- a/dev/tasks/conda-recipes/variants/linux_python3.5.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.5' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_python3.6.yaml b/dev/tasks/conda-recipes/variants/linux_python3.6.yaml deleted file mode 100644 index 6b7d8896ac369..0000000000000 --- a/dev/tasks/conda-recipes/variants/linux_python3.6.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.6' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml new file mode 100644 index 0000000000000..85d3db6a5a875 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml @@ -0,0 +1,52 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +c_compiler: +- clang +cxx_compiler: +- clangxx +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '2.7' +snappy: +- 1.1.7 +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml new file mode 100644 index 0000000000000..4fd6bd2b52d82 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml @@ -0,0 +1,52 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +c_compiler: +- clang +cxx_compiler: +- clangxx +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '3.6' +snappy: +- 1.1.7 +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml new file mode 100644 index 0000000000000..4e4a8df8bd1b5 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml @@ -0,0 +1,52 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +c_compiler: +- clang +cxx_compiler: +- clangxx +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '3.7' +snappy: +- 1.1.7 +zip_keys: +- - c_compiler + - 
cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_python2.7.yaml b/dev/tasks/conda-recipes/variants/osx_python2.7.yaml deleted file mode 100644 index b8fc15f924dd5..0000000000000 --- a/dev/tasks/conda-recipes/variants/osx_python2.7.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -macos_machine: -- x86_64-apple-darwin13.4.0 -macos_min_version: -- '10.9' -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '2.7' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_python3.5.yaml b/dev/tasks/conda-recipes/variants/osx_python3.5.yaml deleted file mode 100644 index 05f7a8dd4d36d..0000000000000 --- a/dev/tasks/conda-recipes/variants/osx_python3.5.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -macos_machine: -- x86_64-apple-darwin13.4.0 -macos_min_version: -- '10.9' -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.5' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_python3.6.yaml b/dev/tasks/conda-recipes/variants/osx_python3.6.yaml deleted file mode 100644 index 6b7d8896ac369..0000000000000 --- a/dev/tasks/conda-recipes/variants/osx_python3.6.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.6' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml deleted file mode 100644 index d886b0e39ff7f..0000000000000 --- a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -boost_cpp: -- 1.67.0 -c_compiler: -- vs2015 -cxx_compiler: -- vs2015 -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.5' -snappy: -- 1.1.7 -zip_keys: -- - python - - c_compiler - - cxx_compiler -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml index 880642f5b7d85..5a57d02fd0ea3 100644 --- a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml +++ b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml @@ -1,31 +1,22 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - boost_cpp: -- 1.67.0 +- 1.68.0 c_compiler: - vs2015 +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main cxx_compiler: - vs2015 +libprotobuf: +- '3.6' lz4_c: - 1.8.1 pin_run_as_build: boost-cpp: max_pin: x.x.x + libprotobuf: + max_pin: x.x lz4-c: max_pin: x.x.x python: diff --git a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml new file mode 100644 index 0000000000000..06bd37d5ea7c6 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml @@ -0,0 +1,42 @@ +boost_cpp: +- 1.68.0 +c_compiler: +- vs2015 +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- vs2015 +libprotobuf: +- '3.6' +lz4_c: +- 1.8.1 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + libprotobuf: + max_pin: x.x + lz4-c: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x + snappy: + max_pin: x.x.x + zlib: + max_pin: x.x + zstd: + max_pin: x.x.x +python: +- '3.7' +snappy: +- 1.1.7 +zip_keys: +- - python + - c_compiler + - cxx_compiler +zlib: +- '1.2' +zstd: +- 1.3.3 diff --git a/dev/tasks/crossbow.py b/dev/tasks/crossbow.py index 74facf4b7fa01..2d0c53089d056 100755 --- a/dev/tasks/crossbow.py +++ b/dev/tasks/crossbow.py @@ -396,11 +396,13 @@ def __init__(self, head, branch, remote, version, email=None): self.branch = branch self.remote = remote self.version = version + self.no_rc_version = re.sub(r'-rc\d+\Z', '', version) @classmethod - def from_repo(cls, repo): + def from_repo(cls, repo, version=None): assert isinstance(repo, Repo) - version = get_version(repo.path, local_scheme=lambda v: '') + if version is None: + version = get_version(repo.path, local_scheme=lambda v: '') return cls(head=str(repo.head.target), email=repo.email, branch=repo.branch.branch_name, @@ -587,22 +589,52 @@ def load_tasks_from_config(config_path, task_names, group_names): help='Task configuration yml. Defaults to tasks.yml') @click.option('--arrow-version', '-v', default=None, help='Set target version explicitly') +@click.option('--arrow-repo', '-r', default=None, + help='Set Github repo name explicitly, e.g. apache/arrow, ' + 'kszucs/arrow; this repository is going to be cloned on ' + 'the CI services. Note that no validation happens locally, ' + 'and --arrow-branch and --arrow-sha may need to be ' + 'defined as well') +@click.option('--arrow-branch', '-b', default='master', + help='Give the branch name explicitly, e.g. master, ARROW-1949. ' 'Only available if --arrow-repo is set.') +@click.option('--arrow-sha', '-t', default='HEAD', + help='Set commit SHA or Tag name explicitly, e.g. f67a515, ' 'apache-arrow-0.11.1. Only available if both --arrow-repo ' 'and --arrow-branch are set.') @click.option('--dry-run/--push', default=False, help='Just display the rendered CI configurations without ' 'submitting them') @click.pass_context -def submit(ctx, task, group, job_prefix, config_path, arrow_version, dry_run): +def submit(ctx, task, group, job_prefix, config_path, arrow_version, + arrow_repo, arrow_branch, arrow_sha, dry_run): queue, arrow = ctx.obj['queue'], ctx.obj['arrow'] - target = Target.from_repo(arrow) - # explicitly set arrow version - if arrow_version: - target.version = arrow_version + if arrow_repo is not None: + values = {'version': arrow_version, + 'branch': arrow_branch, + 'sha': arrow_sha} + for k, v in values.items(): + if not v: + raise ValueError('Must pass --arrow-{} argument'.format(k)) + + # Set repo url, branch and sha explicitly - this aims to make the + # release procedure a bit simpler. + # Note that the target revision's crossbow templates must be + # compatible with the locally checked out version of crossbow (which + # is the case during the release procedure), because the templates + # still contain some business logic (dependency installation, + # deployments) which will be reduced to a single command in the future. + remote = 'https://github.com/{}'.format(arrow_repo) + target = Target(head=arrow_sha, branch=arrow_branch, remote=remote, + version=arrow_version) + else: + # instantiate target from the locally checked out repository and branch + target = Target.from_repo(arrow, version=arrow_version) - no_rc_version = re.sub(r'-rc\d+\Z', '', target.version) params = { 'version': target.version, - 'no_rc_version': no_rc_version, + 'no_rc_version': target.no_rc_version, } # task and group variables are lists, containing multiple values diff --git a/dev/tasks/gandiva-jars/build-cpp.sh b/dev/tasks/gandiva-jars/build-cpp.sh index a0538cf6f3116..5f0cef3ee6cd0 100755 --- a/dev/tasks/gandiva-jars/build-cpp.sh +++ b/dev/tasks/gandiva-jars/build-cpp.sh @@ -27,8 +27,11 @@ pushd arrow/cpp pushd build cmake -DCMAKE_BUILD_TYPE=Release \ -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_JAVA=ON \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ + -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_BOOST_USE_SHARED=OFF \ .. 
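Taken together, the new crossbow options let a release manager submit builds for a tag on any fork without first checking that revision out locally; a hypothetical invocation in bash (the repository, tag, version, and --group flag spelling below are illustrative, not taken from the patch):

    # Submit the conda packaging tasks against a release tag; the CI
    # services clone the given repo/branch/sha instead of the local HEAD.
    python dev/tasks/crossbow.py submit \
        --arrow-repo apache/arrow \
        --arrow-branch master \
        --arrow-sha apache-arrow-0.12.0 \
        --arrow-version 0.12.0 \
        --group conda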
make -j4 ctest diff --git a/dev/tasks/gandiva-jars/build-java.sh b/dev/tasks/gandiva-jars/build-java.sh index a9b0bfe8eafdd..d099035a3e220 100755 --- a/dev/tasks/gandiva-jars/build-java.sh +++ b/dev/tasks/gandiva-jars/build-java.sh @@ -19,11 +19,19 @@ set -e +source arrow/ci/travis_env_common.sh + +CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp/build/release + pushd arrow/java + if [ $TRAVIS_OS_NAME == "linux" ]; then + ldd $CPP_BUILD_DIR/libgandiva_jni.so + fi + # build the entire project - mvn clean install -DskipTests -P gandiva -Dgandiva.cpp.build.dir=../../cpp/build/release + mvn clean install -DskipTests -P gandiva -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR # test only gandiva - mvn test -P gandiva -pl gandiva -Dgandiva.cpp.build.dir=../../cpp/build/release + mvn test -P gandiva -pl gandiva -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR # copy the jars to distribution folder find gandiva/target/ -name "*.jar" -not -name "*tests*" -exec cp {} ../../dist/ \; popd diff --git a/dev/tasks/gandiva-jars/travis.linux.yml b/dev/tasks/gandiva-jars/travis.linux.yml index 8b311ca962e7b..8526b48a54346 100644 --- a/dev/tasks/gandiva-jars/travis.linux.yml +++ b/dev/tasks/gandiva-jars/travis.linux.yml @@ -32,6 +32,7 @@ env: - ARROW_TRAVIS_USE_TOOLCHAIN=1 before_install: + # gcc 4.9 is required for the static linking of libstdc++ - export CC="gcc-4.9" CXX="g++-4.9" - ulimit -c unlimited -S - | @@ -42,9 +43,9 @@ before_install: before_script: - git clone -b {{ arrow.branch }} {{ arrow.remote }} arrow - git -C arrow checkout {{ arrow.head }} + - export TRAVIS_BUILD_DIR=$TRAVIS_BUILD_DIR/arrow - arrow/ci/travis_install_linux.sh - arrow/ci/travis_install_clang_tools.sh - - export TRAVIS_BUILD_DIR=$TRAVIS_BUILD_DIR/arrow - arrow/ci/travis_install_toolchain.sh script: diff --git a/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile b/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile index 4dde574cbf95d..70cefaabf262e 100644 --- a/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile +++ b/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile @@ -22,6 +22,9 @@ ENV DEBIAN_FRONTEND noninteractive ARG DEBUG RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list +RUN \ + echo "deb http://deb.debian.org/debian stretch-backports main" > \ + /etc/apt/sources.list.d/backports.list RUN \ quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ @@ -30,6 +33,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile index 5d3c9ba2932ed..68de4d569a663 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile index 519d058d4b2e3..0d871eaa2635d 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile index 17cb27713f08c..c7c5b1e09ece1 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile +++ 
b/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/changelog b/dev/tasks/linux-packages/debian.ubuntu-trusty/changelog index e54e05c89a0dd..0aa6dd85a702b 100644 --- a/dev/tasks/linux-packages/debian.ubuntu-trusty/changelog +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/changelog @@ -1,3 +1,9 @@ +apache-arrow (0.12.0-1) unstable; urgency=low + + * New upstream release. + + -- Krisztián Szűcs Wed, 16 Jan 2019 03:29:25 -0000 + apache-arrow (0.11.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/control b/dev/tasks/linux-packages/debian.ubuntu-trusty/control index eb1f74b8d4553..9fd699bbb55f2 100644 --- a/dev/tasks/linux-packages/debian.ubuntu-trusty/control +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/control @@ -20,7 +20,7 @@ Build-Depends-Indep: libglib2.0-doc Standards-Version: 3.9.6 Homepage: https://arrow.apache.org/ -Package: libarrow12 +Package: libarrow13 Section: libs Architecture: any Multi-Arch: same @@ -38,12 +38,12 @@ Architecture: any Multi-Arch: same Depends: ${misc:Depends}, - libarrow12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides C++ header files. -Package: libplasma12 +Package: libplasma13 Section: libs Architecture: any Multi-Arch: same @@ -51,7 +51,7 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides C++ library files to connect plasma_store_server. @@ -63,7 +63,7 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libplasma12 (= ${binary:Version}) + libplasma13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides plasma_store_server. @@ -75,12 +75,12 @@ Multi-Arch: same Depends: ${misc:Depends}, libarrow-dev (= ${binary:Version}), - libplasma12 (= ${binary:Version}) + libplasma13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides C++ header files. -Package: libparquet12 +Package: libparquet13 Section: libs Architecture: any Multi-Arch: same @@ -99,12 +99,12 @@ Multi-Arch: same Depends: ${misc:Depends}, libarrow-dev (= ${binary:Version}), - libparquet12 (= ${binary:Version}) + libparquet13 (= ${binary:Version}) Description: Apache Parquet is a columnar storage format . This package provides C++ header files. -Package: libarrow-glib12 +Package: libarrow-glib13 Section: libs Architecture: any Multi-Arch: same @@ -112,7 +112,7 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides GLib based library files. 
@@ -136,7 +136,7 @@ Depends: ${misc:Depends}, libglib2.0-dev, libarrow-dev (= ${binary:Version}), - libarrow-glib12 (= ${binary:Version}), + libarrow-glib13 (= ${binary:Version}), gir1.2-arrow-1.0 (= ${binary:Version}) Suggests: libarrow-glib-doc Description: Apache Arrow is a data processing library for analysis @@ -154,7 +154,7 @@ Description: Apache Arrow is a data processing library for analysis . This package provides documentations. -Package: libplasma-glib12 +Package: libplasma-glib13 Section: libs Architecture: any Multi-Arch: same @@ -162,8 +162,8 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow-glib12 (= ${binary:Version}), - libplasma12 (= ${binary:Version}) + libarrow-glib13 (= ${binary:Version}), + libplasma13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides GLib based library files to connect plasma_store_server. @@ -187,7 +187,7 @@ Depends: ${misc:Depends}, libplasma-dev (= ${binary:Version}), libarrow-glib-dev (= ${binary:Version}), - libplasma-glib12 (= ${binary:Version}), + libplasma-glib13 (= ${binary:Version}), gir1.2-plasma-1.0 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . @@ -204,7 +204,7 @@ Description: Plasma is an in-memory object store and cache for big data. . This package provides documentations. -Package: libparquet-glib12 +Package: libparquet-glib13 Section: libs Architecture: any Multi-Arch: same @@ -212,8 +212,8 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow-glib12 (= ${binary:Version}), - libparquet12 (= ${binary:Version}) + libarrow-glib13 (= ${binary:Version}), + libparquet13 (= ${binary:Version}) Description: Apache Parquet is a columnar storage format . This package provides GLib based library files. 
@@ -237,7 +237,7 @@ Depends: ${misc:Depends}, libarrow-glib-dev (= ${binary:Version}), libparquet-dev (= ${binary:Version}), - libparquet-glib12 (= ${binary:Version}), + libparquet-glib13 (= ${binary:Version}), gir1.2-parquet-1.0 (= ${binary:Version}) Suggests: libparquet-glib-doc Description: Apache Parquet is a columnar storage format diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-python12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-python13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-python12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-python13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma13.install similarity index 100% rename from dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install rename to dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma13.install diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules index 6f2ffdc416906..4eb26772df00c 100755 --- a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules @@ -22,7 +22,6 @@ override_dh_auto_configure: --builddirectory=cpp_build \ -- \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DARROW_BUILD_TESTS=OFF \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_PLASMA=ON diff --git a/dev/tasks/linux-packages/debian/changelog b/dev/tasks/linux-packages/debian/changelog index a0aff6fb28a53..c3a1d58c846d7 100644 --- a/dev/tasks/linux-packages/debian/changelog +++ b/dev/tasks/linux-packages/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (0.12.0-1) unstable; urgency=low + + * New upstream release. 
+ + -- Krisztián Szűcs Wed, 16 Jan 2019 03:29:25 -0000 + apache-arrow (0.11.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/debian/control b/dev/tasks/linux-packages/debian/control index b5c696363798f..6aa5b551aa8d4 100644 --- a/dev/tasks/linux-packages/debian/control +++ b/dev/tasks/linux-packages/debian/control @@ -5,6 +5,7 @@ Maintainer: Kouhei Sutou Build-Depends: autoconf-archive, bison, + clang-6.0, cmake, debhelper (>= 9.20160115), dh-autoreconf, @@ -27,7 +28,7 @@ Build-Depends-Indep: libglib2.0-doc Standards-Version: 3.9.6 Homepage: https://arrow.apache.org/ -Package: libarrow12 +Package: libarrow13 Section: libs Architecture: any Multi-Arch: same @@ -39,7 +40,7 @@ Description: Apache Arrow is a data processing library for analysis . This package provides C++ library files. -Package: libarrow-python12 +Package: libarrow-python13 Section: libs Architecture: any Multi-Arch: same @@ -47,14 +48,14 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow12 (= ${binary:Version}), + libarrow13 (= ${binary:Version}), python3, python3-numpy Description: Apache Arrow is a data processing library for analysis . This package provides C++ library files for Python support. -Package: libarrow-cuda12 +Package: libarrow-cuda13 Section: libs Architecture: any Multi-Arch: same @@ -62,7 +63,7 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides C++ library files for CUDA support. @@ -73,7 +74,7 @@ Architecture: any Multi-Arch: same Depends: ${misc:Depends}, - libarrow12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides C++ header files. @@ -85,7 +86,7 @@ Multi-Arch: same Depends: ${misc:Depends}, libarrow-dev (= ${binary:Version}), - libarrow-python12 (= ${binary:Version}) + libarrow-python13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides C++ header files for Python support. @@ -97,12 +98,12 @@ Multi-Arch: same Depends: ${misc:Depends}, libarrow-dev (= ${binary:Version}), - libarrow-cuda12 (= ${binary:Version}) + libarrow-cuda13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides C++ header files for CUDA support. -Package: libplasma12 +Package: libgandiva13 Section: libs Architecture: any Multi-Arch: same @@ -110,7 +111,34 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow-cuda12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides C++ library files. + +Package: libgandiva-dev +Section: libdevel +Architecture: any +Multi-Arch: same +Depends: + ${misc:Depends}, + libarrow-dev (= ${binary:Version}), + libgandiva13 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides C++ header files. + +Package: libplasma13 +Section: libs +Architecture: any +Multi-Arch: same +Pre-Depends: ${misc:Pre-Depends} +Depends: + ${misc:Depends}, + ${shlibs:Depends}, + libarrow-cuda13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . 
This package provides C++ library files to connect plasma_store_server. @@ -122,7 +150,7 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libplasma12 (= ${binary:Version}) + libplasma13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides plasma_store_server. @@ -134,12 +162,12 @@ Multi-Arch: same Depends: ${misc:Depends}, libarrow-cuda-dev (= ${binary:Version}), - libplasma12 (= ${binary:Version}) + libplasma13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides C++ header files. -Package: libparquet12 +Package: libparquet13 Section: libs Architecture: any Multi-Arch: same @@ -158,12 +186,12 @@ Multi-Arch: same Depends: ${misc:Depends}, libarrow-dev (= ${binary:Version}), - libparquet12 (= ${binary:Version}) + libparquet13 (= ${binary:Version}) Description: Apache Parquet is a columnar storage format . This package provides C++ header files. -Package: libarrow-glib12 +Package: libarrow-glib13 Section: libs Architecture: any Multi-Arch: same @@ -171,7 +199,7 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow12 (= ${binary:Version}) + libarrow13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides GLib based library files. @@ -195,7 +223,7 @@ Depends: ${misc:Depends}, libglib2.0-dev, libarrow-dev (= ${binary:Version}), - libarrow-glib12 (= ${binary:Version}), + libarrow-glib13 (= ${binary:Version}), gir1.2-arrow-1.0 (= ${binary:Version}) Suggests: libarrow-glib-doc Description: Apache Arrow is a data processing library for analysis @@ -213,7 +241,7 @@ Description: Apache Arrow is a data processing library for analysis . This package provides documentations. -Package: libarrow-cuda-glib12 +Package: libarrow-cuda-glib13 Section: libs Architecture: any Multi-Arch: same @@ -221,8 +249,8 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow-glib12 (= ${binary:Version}), - libarrow-cuda12 (= ${binary:Version}) + libarrow-glib13 (= ${binary:Version}), + libarrow-cuda13 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides GLib based library files for CUDA support. @@ -246,13 +274,67 @@ Depends: ${misc:Depends}, libarrow-cuda-dev (= ${binary:Version}), libarrow-glib-dev (= ${binary:Version}), - libarrow-cuda-glib12 (= ${binary:Version}), + libarrow-cuda-glib13 (= ${binary:Version}), gir1.2-arrow-cuda-1.0 (= ${binary:Version}) Description: Apache Arrow is a data processing library for analysis . This package provides GLib based header files for CUDA support. -Package: libplasma-glib12 +Package: libgandiva-glib13 +Section: libs +Architecture: any +Multi-Arch: same +Pre-Depends: ${misc:Pre-Depends} +Depends: + ${misc:Depends}, + ${shlibs:Depends}, + libarrow-glib13 (= ${binary:Version}), + libgandiva13 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GLib based library files. + +Package: gir1.2-gandiva-1.0 +Section: introspection +Architecture: any +Multi-Arch: same +Depends: + ${gir:Depends}, + ${misc:Depends} +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GObject Introspection typelib files. 
+ +Package: libgandiva-glib-dev +Section: libdevel +Architecture: any +Multi-Arch: same +Depends: + ${misc:Depends}, + libgandiva-dev (= ${binary:Version}), + libarrow-glib-dev (= ${binary:Version}), + libgandiva-glib13 (= ${binary:Version}), + gir1.2-gandiva-1.0 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GLib based header files. + +Package: libgandiva-glib-doc +Section: doc +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends} +Recommends: libglib2.0-doc +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides documentations. + +Package: libplasma-glib13 Section: libs Architecture: any Multi-Arch: same @@ -260,8 +342,8 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow-cuda-glib12 (= ${binary:Version}), - libplasma12 (= ${binary:Version}) + libarrow-cuda-glib13 (= ${binary:Version}), + libplasma13 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . This package provides GLib based library files to connect plasma_store_server. @@ -285,7 +367,7 @@ Depends: ${misc:Depends}, libplasma-dev (= ${binary:Version}), libarrow-cuda-glib-dev (= ${binary:Version}), - libplasma-glib12 (= ${binary:Version}), + libplasma-glib13 (= ${binary:Version}), gir1.2-plasma-1.0 (= ${binary:Version}) Description: Plasma is an in-memory object store and cache for big data. . @@ -302,7 +384,7 @@ Description: Plasma is an in-memory object store and cache for big data. . This package provides documentations. -Package: libparquet-glib12 +Package: libparquet-glib13 Section: libs Architecture: any Multi-Arch: same @@ -310,8 +392,8 @@ Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends}, - libarrow-glib12 (= ${binary:Version}), - libparquet12 (= ${binary:Version}) + libarrow-glib13 (= ${binary:Version}), + libparquet13 (= ${binary:Version}) Description: Apache Parquet is a columnar storage format . This package provides GLib based library files. 
@@ -335,7 +417,7 @@ Depends: ${misc:Depends}, libarrow-glib-dev (= ${binary:Version}), libparquet-dev (= ${binary:Version}), - libparquet-glib12 (= ${binary:Version}), + libparquet-glib13 (= ${binary:Version}), gir1.2-parquet-1.0 (= ${binary:Version}) Suggests: libparquet-glib-doc Description: Apache Parquet is a columnar storage format diff --git a/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install b/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install new file mode 100644 index 0000000000000..0433b367a24c8 --- /dev/null +++ b/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install @@ -0,0 +1 @@ +usr/lib/*/girepository-1.0/Gandiva-1.0.typelib diff --git a/dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install b/dev/tasks/linux-packages/debian/libarrow-cuda-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install rename to dev/tasks/linux-packages/debian/libarrow-cuda-glib13.install diff --git a/dev/tasks/linux-packages/debian/libarrow-cuda12.install b/dev/tasks/linux-packages/debian/libarrow-cuda13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libarrow-cuda12.install rename to dev/tasks/linux-packages/debian/libarrow-cuda13.install diff --git a/dev/tasks/linux-packages/debian/libarrow-glib12.install b/dev/tasks/linux-packages/debian/libarrow-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libarrow-glib12.install rename to dev/tasks/linux-packages/debian/libarrow-glib13.install diff --git a/dev/tasks/linux-packages/debian/libarrow-python12.install b/dev/tasks/linux-packages/debian/libarrow-python13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libarrow-python12.install rename to dev/tasks/linux-packages/debian/libarrow-python13.install diff --git a/dev/tasks/linux-packages/debian/libarrow12.install b/dev/tasks/linux-packages/debian/libarrow13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libarrow12.install rename to dev/tasks/linux-packages/debian/libarrow13.install diff --git a/dev/tasks/linux-packages/debian/libgandiva-dev.install b/dev/tasks/linux-packages/debian/libgandiva-dev.install new file mode 100644 index 0000000000000..1e5d264378e69 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/libgandiva.a +usr/lib/*/libgandiva.so +usr/lib/*/pkgconfig/gandiva.pc diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install b/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install new file mode 100644 index 0000000000000..4189dac66ed90 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install @@ -0,0 +1,5 @@ +usr/include/gandiva-glib/ +usr/lib/*/libgandiva-glib.a +usr/lib/*/libgandiva-glib.so +usr/lib/*/pkgconfig/gandiva-glib.pc +usr/share/gir-1.0/Gandiva-1.0.gir diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base new file mode 100644 index 0000000000000..bed6a124c5e08 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base @@ -0,0 +1,9 @@ +Document: gandiva-glib +Title: Gandiva GLib Reference Manual +Author: The Apache Software Foundation +Abstract: Gandiva GLib is a toolset for compiling and evaluating expressions on Arrow Data that uses GLib. 
+Section: Programming + +Format: HTML +Index: /usr/share/doc/libarrow-glib-doc/gandiva-glib/index.html +Files: /usr/share/doc/libarrow-glib-doc/gandiva-glib/*.html diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install new file mode 100644 index 0000000000000..54d2d066c275a --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install @@ -0,0 +1 @@ +usr/share/doc/libarrow-glib-doc/gandiva-glib/ diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links new file mode 100644 index 0000000000000..291b004ed717a --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links @@ -0,0 +1,3 @@ +usr/share/doc/libglib2.0-doc/glib usr/share/doc/libgandiva-glib-doc/glib +usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libgandiva-glib-doc/gobject +usr/share/doc/libarrow-glib-doc/gandiva-glib usr/share/gtk-doc/html/gandiva-glib diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib13.install b/dev/tasks/linux-packages/debian/libgandiva-glib13.install new file mode 100644 index 0000000000000..6257fd43823c0 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib13.install @@ -0,0 +1 @@ +usr/lib/*/libgandiva-glib.so.* diff --git a/dev/tasks/linux-packages/debian/libgandiva13.install b/dev/tasks/linux-packages/debian/libgandiva13.install new file mode 100644 index 0000000000000..38a05876db6e6 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva13.install @@ -0,0 +1,2 @@ +usr/lib/*/libgandiva.so.* +usr/lib/*/gandiva/ diff --git a/dev/tasks/linux-packages/debian/libparquet-glib12.install b/dev/tasks/linux-packages/debian/libparquet-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libparquet-glib12.install rename to dev/tasks/linux-packages/debian/libparquet-glib13.install diff --git a/dev/tasks/linux-packages/debian/libparquet12.install b/dev/tasks/linux-packages/debian/libparquet13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libparquet12.install rename to dev/tasks/linux-packages/debian/libparquet13.install diff --git a/dev/tasks/linux-packages/debian/libplasma-glib12.install b/dev/tasks/linux-packages/debian/libplasma-glib13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libplasma-glib12.install rename to dev/tasks/linux-packages/debian/libplasma-glib13.install diff --git a/dev/tasks/linux-packages/debian/libplasma12.install b/dev/tasks/linux-packages/debian/libplasma13.install similarity index 100% rename from dev/tasks/linux-packages/debian/libplasma12.install rename to dev/tasks/linux-packages/debian/libplasma13.install diff --git a/dev/tasks/linux-packages/debian/rules b/dev/tasks/linux-packages/debian/rules index f3cc2a045c1ee..d82f306cd2656 100755 --- a/dev/tasks/linux-packages/debian/rules +++ b/dev/tasks/linux-packages/debian/rules @@ -24,12 +24,13 @@ override_dh_auto_configure: --builddirectory=cpp_build \ -- \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DARROW_BUILD_TESTS=OFF \ -DARROW_PYTHON=ON \ -DARROW_BOOST_USE_SHARED=ON \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_PLASMA=ON \ + -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_JAVA=OFF \ -DPROTOBUF_HOME=/usr \ -DARROW_PROTOBUF_USE_SHARED=ON \ -DPythonInterp_FIND_VERSION=ON \ diff --git a/dev/tasks/linux-packages/yum/arrow.spec.in b/dev/tasks/linux-packages/yum/arrow.spec.in index ad60dfbdde18e..9391ea583f338 100644 --- 
a/dev/tasks/linux-packages/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/yum/arrow.spec.in @@ -75,8 +75,7 @@ cd cpp/build %if %{use_parquet} -DARROW_PARQUET=ON \ %endif - -DARROW_PLASMA=ON \ - -DARROW_BUILD_TESTS=OFF + -DARROW_PLASMA=ON make %{?_smp_mflags} cd - @@ -398,6 +397,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Wed Jan 16 2019 Krisztián Szűcs - 0.12.0-1 +- New upstream release. + * Thu Oct 04 2018 Kouhei Sutou - 0.11.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/yum/centos-6/Dockerfile b/dev/tasks/linux-packages/yum/centos-6/Dockerfile index 8143b99efd180..c7de92296767a 100644 --- a/dev/tasks/linux-packages/yum/centos-6/Dockerfile +++ b/dev/tasks/linux-packages/yum/centos-6/Dockerfile @@ -20,14 +20,13 @@ FROM centos:6 ARG DEBUG ENV \ - SRPM_DOWNLOAD_URL=http://vault.centos.org/7.4.1708/os/Source/SPackages \ + SRPM_DOWNLOAD_URL=http://vault.centos.org/7.6.1810/os/Source/SPackages \ LIBARCHIVE_SRPM_BASE=libarchive-3.1.2-10.el7_2.src.rpm RUN \ quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ yum update -y ${quiet} && \ yum install -y ${quiet} \ - centos-release-scl \ epel-release && \ yum install -y \ autoconf268 \ @@ -43,9 +42,10 @@ RUN \ ~/rpmbuild/SPECS/libarchive.spec && \ yum install -y ${quiet} ~/rpmbuild/RPMS/*/libarchive-3.*.rpm && \ rm -rf ${LIBARCHIVE_SRPM_BASE} ~/rpmbuild/ && \ + yum install -y ${quiet} \ + centos-release-scl && \ yum install -y ${quiet} \ boost-devel \ - centos-release-scl \ cmake3 \ devtoolset-6 \ git \ diff --git a/dev/tasks/python-wheels/appveyor.yml b/dev/tasks/python-wheels/appveyor.yml index 016041a6c6701..be6ad302e1a5c 100644 --- a/dev/tasks/python-wheels/appveyor.yml +++ b/dev/tasks/python-wheels/appveyor.yml @@ -20,11 +20,11 @@ os: Visual Studio 2015 environment: ARCH: "64" GENERATOR: Visual Studio 14 2015 Win64 - NUMPY: "{{ numpy_version }}" + NUMPY: "1.14.5" PYTHON: "{{ python_version }}" MSVC_DEFAULT_OPTIONS: ON ARROW_SRC: C:\apache-arrow - PYARROW_VERSION: {{ arrow.version }} + PYARROW_VERSION: {{ arrow.no_rc_version }} PYARROW_REF: {{ arrow.head }} init: diff --git a/dev/tasks/python-wheels/linux-test.sh b/dev/tasks/python-wheels/linux-test.sh index 163730a9f38da..45efdb0c91e05 100755 --- a/dev/tasks/python-wheels/linux-test.sh +++ b/dev/tasks/python-wheels/linux-test.sh @@ -24,11 +24,18 @@ pip install /arrow/python/manylinux1/dist/*.whl python --version # Test optional dependencies -python -c "import pyarrow" -python -c "import pyarrow.orc" -python -c "import pyarrow.parquet" -python -c "import pyarrow.plasma" +command=" +import sys +import pyarrow +import pyarrow.orc +import pyarrow.parquet +import pyarrow.plasma + +if sys.version_info.major > 2: + import pyarrow.gandiva +" +python -c "$command" # Run pyarrow tests -pip install pytest pandas +pip install -r /arrow/python/requirements-test.txt pytest --pyargs pyarrow diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh index 5c69904ff4348..22c44c157337f 100755 --- a/dev/tasks/python-wheels/osx-build.sh +++ b/dev/tasks/python-wheels/osx-build.sh @@ -99,9 +99,8 @@ function build_wheel { # build will also work with newer NumPy versions. 
export ARROW_HOME=`pwd`/arrow-dist export PARQUET_HOME=`pwd`/arrow-dist - if [ -n "$BUILD_DEPENDS" ]; then - pip install $(pip_opts) $BUILD_DEPENDS - fi + + pip install $(pip_opts) -r python/requirements-wheel.txt cython pushd cpp mkdir build @@ -161,10 +160,6 @@ function install_run { wheelhouse="$PWD/python/dist" - # Install test dependencies and built wheel - if [ -n "$TEST_DEPENDS" ]; then - pip install $(pip_opts) $TEST_DEPENDS - fi # Install compatible wheel pip install $(pip_opts) \ $(python $multibuild_dir/supported_wheels.py $wheelhouse/*.whl) @@ -179,7 +174,8 @@ function install_run { python -c "import pyarrow.plasma" # Run pyarrow tests - pip install pytest pytest-faulthandler + pip install $(pip_opts) -r python/requirements-test.txt + py.test --pyargs pyarrow popd diff --git a/dev/tasks/python-wheels/travis.linux.yml b/dev/tasks/python-wheels/travis.linux.yml index 9a8f804d1cc51..b5cbc65bc7e7e 100644 --- a/dev/tasks/python-wheels/travis.linux.yml +++ b/dev/tasks/python-wheels/travis.linux.yml @@ -40,9 +40,8 @@ script: # build wheel - pushd arrow/python/manylinux1 - docker run --shm-size=2g - -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.version }} + -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} -e PYTHON_VERSIONS="{{ python_version }},{{ unicode_width }}" - -e WHEEL_VERSION={{ wheel_version }} -v $PWD:/io -v $PWD/../../:/arrow quay.io/xhochy/arrow_manylinux1_x86_64_base:latest /io/build_arrow.sh diff --git a/dev/tasks/python-wheels/travis.osx.yml b/dev/tasks/python-wheels/travis.osx.yml index 2f0d168a3fb46..a98841335e728 100644 --- a/dev/tasks/python-wheels/travis.osx.yml +++ b/dev/tasks/python-wheels/travis.osx.yml @@ -26,11 +26,9 @@ env: - PLAT=x86_64 - TRAVIS_TAG={{ task.tag }} - MACOSX_DEPLOYMENT_TARGET="10.9" - - PYARROW_VERSION={{ arrow.version }} + - PYARROW_VERSION={{ arrow.no_rc_version }} - PYARROW_BUILD_VERBOSE=1 - MB_PYTHON_VERSION={{ python_version }} - - BUILD_DEPENDS="wheel=={{ wheel_version }} numpy=={{ numpy_version }} cython==0.27.3 six" - - TEST_DEPENDS="numpy=={{ numpy_version }} pandas=={{ pandas_version }} six" before_install: - git clone https://github.com/matthew-brett/multibuild # TODO pin it diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat index 22e306ab1f1eb..f85c8e8b7490e 100644 --- a/dev/tasks/python-wheels/win-build.bat +++ b/dev/tasks/python-wheels/win-build.bat @@ -82,7 +82,7 @@ popd @rem test the wheel call deactivate conda create -n wheel-test -q -y python=%PYTHON% ^ - numpy=%NUMPY% pandas pytest + numpy=%NUMPY% pandas pytest hypothesis call activate wheel-test pip install --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index bd49616f6bd3e..751420c3e6094 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -19,14 +19,14 @@ groups: # these groups are just for convenience # makes it easier to submit related tasks conda: - - conda-linux-py27 - - conda-linux-py35 - - conda-linux-py36 - - conda-osx-py27 - - conda-osx-py35 - - conda-osx-py36 - - conda-win-py35 - - conda-win-py36 + - conda-linux-gcc-py27 + - conda-linux-gcc-py36 + - conda-linux-gcc-py37 + - conda-osx-clang-py27 + - conda-osx-clang-py36 + - conda-osx-clang-py37 + - conda-win-vs2015-py36 + - conda-win-vs2015-py37 wheel: - wheel-linux-cp27m - wheel-linux-cp27mu @@ -60,85 +60,85 @@ tasks: # artifacts: list of regex patterns, each needs to match a single github # release asset, version variable is replaced in the pattern # e.g.: - # - 
pyarrow-{version}-py36(h[a-z0-9]+)_0-linux-64.tar.bz2 + # - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0-linux-64.tar.bz2 ############################## Conda Linux ################################## - conda-linux-py27: + conda-linux-gcc-py27: platform: linux template: conda-recipes/travis.linux.yml params: - variant_config_file: variants/linux_python2.7.yaml + config: linux_c_compilergcccxx_compilergxxpython2.7 artifacts: - - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 - conda-linux-py35: + conda-linux-gcc-py36: platform: linux template: conda-recipes/travis.linux.yml params: - variant_config_file: variants/linux_python3.5.yaml + config: linux_c_compilergcccxx_compilergxxpython3.6 artifacts: - - arrow-cpp-{version}-py35(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py35(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 - conda-linux-py36: + conda-linux-gcc-py37: platform: linux template: conda-recipes/travis.linux.yml params: - variant_config_file: variants/linux_python3.6.yaml + config: linux_c_compilergcccxx_compilergxxpython3.7 artifacts: - - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Conda OSX #################################### - conda-osx-py27: + conda-osx-clang-py27: platform: osx template: conda-recipes/travis.osx.yml params: - variant_config_file: variants/osx_python2.7.yaml + config: osx_c_compilerclangcxx_compilerclangxxpython2.7 artifacts: - - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 - conda-osx-py35: + conda-osx-clang-py36: platform: osx template: conda-recipes/travis.osx.yml params: - variant_config_file: variants/osx_python3.5.yaml + config: osx_c_compilerclangcxx_compilerclangxxpython3.6 artifacts: - - arrow-cpp-{version}-py35(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py35(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 - conda-osx-py36: + conda-osx-clang-py37: platform: osx template: conda-recipes/travis.osx.yml params: - variant_config_file: variants/osx_python3.6.yaml + config: osx_c_compilerclangcxx_compilerclangxxpython3.7 artifacts: - - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Conda Windows ################################ - conda-win-py35: + conda-win-vs2015-py36: platform: win template: conda-recipes/appveyor.yml params: - variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.5.yaml + config: win_c_compilervs2015cxx_compilervs2015python3.6 artifacts: - - arrow-cpp-{version}-py35_vc14(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py35(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36_vc14(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 - conda-win-py36: + conda-win-vs2015-py37: platform: win template: conda-recipes/appveyor.yml 
params: - variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.6.yaml + config: win_c_compilervs2015cxx_compilervs2015python3.7 artifacts: - - arrow-cpp-{version}-py36_vc14(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37_vc14(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Wheel Linux ################################## @@ -146,60 +146,55 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 2.7 unicode_width: 16 test_docker_images: [] artifacts: - - pyarrow-{version}-cp27-cp27m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp27-cp27m-manylinux1_x86_64.whl wheel-linux-cp27mu: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 2.7 unicode_width: 32 test_docker_images: - python:2.7-slim # debian ucs4 artifacts: - - pyarrow-{version}-cp27-cp27mu-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp27-cp27mu-manylinux1_x86_64.whl wheel-linux-cp35m: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.5 unicode_width: 16 test_docker_images: - python:3.5-slim artifacts: - - pyarrow-{version}-cp35-cp35m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp35-cp35m-manylinux1_x86_64.whl wheel-linux-cp36m: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.6 unicode_width: 16 test_docker_images: - python:3.6-slim artifacts: - - pyarrow-{version}-cp36-cp36m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp36-cp36m-manylinux1_x86_64.whl wheel-linux-cp37m: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.7 unicode_width: 16 test_docker_images: - python:3.7-slim artifacts: - - pyarrow-{version}-cp37-cp37m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp37-cp37m-manylinux1_x86_64.whl ############################## Wheel OSX #################################### @@ -207,45 +202,33 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 2.7 - wheel_version: 0.31.1 artifacts: - - pyarrow-{version}-cp27-cp27m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp27-cp27m-macosx_10_6_intel.whl wheel-osx-cp35m: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.5 - wheel_version: 0.31.1 artifacts: - - pyarrow-{version}-cp35-cp35m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp35-cp35m-macosx_10_6_intel.whl wheel-osx-cp36m: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.6 - wheel_version: 0.31.1 artifacts: - - pyarrow-{version}-cp36-cp36m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp36-cp36m-macosx_10_6_intel.whl wheel-osx-cp37m: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.7 - wheel_version: 0.31.1 artifacts: - - pyarrow-{version}-cp37-cp37m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp37-cp37m-macosx_10_6_intel.whl ############################## Wheel Windows ################################ @@ -253,28 +236,25 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.5 artifacts: - - 
pyarrow-{version}-cp35-cp35m-win_amd64.whl + - pyarrow-{no_rc_version}-cp35-cp35m-win_amd64.whl wheel-win-cp36m: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.6 artifacts: - - pyarrow-{version}-cp36-cp36m-win_amd64.whl + - pyarrow-{no_rc_version}-cp36-cp36m-win_amd64.whl wheel-win-cp37m: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.7 artifacts: - - pyarrow-{version}-cp37-cp37m-win_amd64.whl + - pyarrow-{no_rc_version}-cp37-cp37m-win_amd64.whl ############################## Linux PKGS #################################### @@ -294,38 +274,46 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - - libarrow-glib12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow-glib12_{no_rc_version}-1_amd64.deb + - libarrow-glib13-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow-glib13_{no_rc_version}-1_amd64.deb - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-cuda-glib12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb - - libarrow-cuda12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow-cuda12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib13-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib13_{no_rc_version}-1_amd64.deb + - libarrow-cuda13-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow-cuda13_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - - libarrow-python12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow-python12_{no_rc_version}-1_amd64.deb - - libarrow12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow12_{no_rc_version}-1_amd64.deb + - libarrow-python13-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow-python13_{no_rc_version}-1_amd64.deb + - libarrow13-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow13_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib13-dbgsym_{no_rc_version}-1_amd64.deb + - libgandiva-glib13_{no_rc_version}-1_amd64.deb + - libgandiva13-dbgsym_{no_rc_version}-1_amd64.deb + - libgandiva13_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb - - libparquet-glib12-dbgsym_{no_rc_version}-1_amd64.deb - - libparquet-glib12_{no_rc_version}-1_amd64.deb - - libparquet12-dbgsym_{no_rc_version}-1_amd64.deb - - libparquet12_{no_rc_version}-1_amd64.deb + - libparquet-glib13-dbgsym_{no_rc_version}-1_amd64.deb + - libparquet-glib13_{no_rc_version}-1_amd64.deb + - libparquet13-dbgsym_{no_rc_version}-1_amd64.deb + - libparquet13_{no_rc_version}-1_amd64.deb - libplasma-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-doc_{no_rc_version}-1_all.deb - - libplasma-glib12-dbgsym_{no_rc_version}-1_amd64.deb - - libplasma-glib12_{no_rc_version}-1_amd64.deb - - libplasma12-dbgsym_{no_rc_version}-1_amd64.deb - - libplasma12_{no_rc_version}-1_amd64.deb + - 
libplasma-glib13-dbgsym_{no_rc_version}-1_amd64.deb + - libplasma-glib13_{no_rc_version}-1_amd64.deb + - libplasma13-dbgsym_{no_rc_version}-1_amd64.deb + - libplasma13_{no_rc_version}-1_amd64.deb ubuntu-trusty: platform: linux @@ -347,18 +335,18 @@ tasks: - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - - libarrow-glib12_{no_rc_version}-1_amd64.deb - - libarrow12_{no_rc_version}-1_amd64.deb + - libarrow-glib13_{no_rc_version}-1_amd64.deb + - libarrow13_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb - - libparquet-glib12_{no_rc_version}-1_amd64.deb - - libparquet12_{no_rc_version}-1_amd64.deb + - libparquet-glib13_{no_rc_version}-1_amd64.deb + - libparquet13_{no_rc_version}-1_amd64.deb - libplasma-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-doc_{no_rc_version}-1_all.deb - - libplasma-glib12_{no_rc_version}-1_amd64.deb - - libplasma12_{no_rc_version}-1_amd64.deb + - libplasma-glib13_{no_rc_version}-1_amd64.deb + - libplasma13_{no_rc_version}-1_amd64.deb ubuntu-xenial: platform: linux @@ -376,29 +364,35 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - - libarrow-glib12_{no_rc_version}-1_amd64.deb + - libarrow-glib13_{no_rc_version}-1_amd64.deb - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb - - libarrow-cuda12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib13_{no_rc_version}-1_amd64.deb + - libarrow-cuda13_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - - libarrow-python12_{no_rc_version}-1_amd64.deb - - libarrow12_{no_rc_version}-1_amd64.deb + - libarrow-python13_{no_rc_version}-1_amd64.deb + - libarrow13_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib13_{no_rc_version}-1_amd64.deb + - libgandiva13_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb - - libparquet-glib12_{no_rc_version}-1_amd64.deb - - libparquet12_{no_rc_version}-1_amd64.deb + - libparquet-glib13_{no_rc_version}-1_amd64.deb + - libparquet13_{no_rc_version}-1_amd64.deb - libplasma-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-doc_{no_rc_version}-1_all.deb - - libplasma-glib12_{no_rc_version}-1_amd64.deb - - libplasma12_{no_rc_version}-1_amd64.deb + - libplasma-glib13_{no_rc_version}-1_amd64.deb + - libplasma13_{no_rc_version}-1_amd64.deb ubuntu-bionic: platform: linux @@ -416,29 +410,35 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - 
gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - - libarrow-glib12_{no_rc_version}-1_amd64.deb + - libarrow-glib13_{no_rc_version}-1_amd64.deb - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb - - libarrow-cuda12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib13_{no_rc_version}-1_amd64.deb + - libarrow-cuda13_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - - libarrow-python12_{no_rc_version}-1_amd64.deb - - libarrow12_{no_rc_version}-1_amd64.deb + - libarrow-python13_{no_rc_version}-1_amd64.deb + - libarrow13_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib13_{no_rc_version}-1_amd64.deb + - libgandiva13_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb - - libparquet-glib12_{no_rc_version}-1_amd64.deb - - libparquet12_{no_rc_version}-1_amd64.deb + - libparquet-glib13_{no_rc_version}-1_amd64.deb + - libparquet13_{no_rc_version}-1_amd64.deb - libplasma-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-doc_{no_rc_version}-1_all.deb - - libplasma-glib12_{no_rc_version}-1_amd64.deb - - libplasma12_{no_rc_version}-1_amd64.deb + - libplasma-glib13_{no_rc_version}-1_amd64.deb + - libplasma13_{no_rc_version}-1_amd64.deb ubuntu-cosmic: platform: linux @@ -456,29 +456,35 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - - libarrow-glib12_{no_rc_version}-1_amd64.deb + - libarrow-glib13_{no_rc_version}-1_amd64.deb - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb - - libarrow-cuda12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib13_{no_rc_version}-1_amd64.deb + - libarrow-cuda13_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - - libarrow-python12_{no_rc_version}-1_amd64.deb - - libarrow12_{no_rc_version}-1_amd64.deb + - libarrow-python13_{no_rc_version}-1_amd64.deb + - libarrow13_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib13_{no_rc_version}-1_amd64.deb + - libgandiva13_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb - - libparquet-glib12_{no_rc_version}-1_amd64.deb - - libparquet12_{no_rc_version}-1_amd64.deb + - libparquet-glib13_{no_rc_version}-1_amd64.deb + - libparquet13_{no_rc_version}-1_amd64.deb - libplasma-dev_{no_rc_version}-1_amd64.deb - 
libplasma-glib-dev_{no_rc_version}-1_amd64.deb - libplasma-glib-doc_{no_rc_version}-1_all.deb - - libplasma-glib12_{no_rc_version}-1_amd64.deb - - libplasma12_{no_rc_version}-1_amd64.deb + - libplasma-glib13_{no_rc_version}-1_amd64.deb + - libplasma13_{no_rc_version}-1_amd64.deb centos-6: platform: linux @@ -531,10 +537,10 @@ tasks: platform: linux template: gandiva-jars/travis.linux.yml artifacts: - - arrow-gandiva-{version}-SNAPSHOT.jar + - arrow-gandiva-{no_rc_version}-SNAPSHOT.jar gandiva-jar-osx: platform: osx template: gandiva-jars/travis.osx.yml artifacts: - - arrow-gandiva-{version}-SNAPSHOT.jar + - arrow-gandiva-{no_rc_version}-SNAPSHOT.jar diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index c158481de461e..32131abd9f2a1 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -19,9 +19,11 @@ groups: # these groups are just for convenience # makes it easier to submit related tasks docker: + - docker-r - docker-rust - docker-cpp - docker-cpp-alpine + - docker-cpp-cmake32 - docker-c_glib - docker-go - docker-python-2.7 @@ -31,19 +33,25 @@ groups: - docker-python-3.6-alpine - docker-java - docker-js + - docker-docs - docker-lint - docker-iwyu - docker-clang-format - - docker-hdfs-integration - docker-pandas-master + - docker-dask-integration + - docker-hdfs-integration + - docker-spark-integration integration: - - docker-hdfs-integration - docker-pandas-master + - docker-dask-integration + - docker-hdfs-integration + - docker-spark-integration cpp-python: - docker-cpp - docker-cpp-alpine + - docker-cpp-cmake32 - docker-python-2.7 - docker-python-2.7-alpine - docker-python-3.6 @@ -62,6 +70,15 @@ tasks: ############################## Language containers ######################### + docker-r: + platform: linux + template: docker-tests/travis.linux.yml + params: + commands: + - docker-compose build cpp + - docker-compose build r + - docker-compose run r + docker-rust: platform: linux template: docker-tests/travis.linux.yml @@ -86,6 +103,14 @@ tasks: - docker-compose build cpp-alpine - docker-compose run cpp-alpine + docker-cpp-cmake32: + platform: linux + template: docker-tests/travis.linux.yml + params: + commands: + - docker-compose build cpp-cmake32 + - docker-compose run cpp-cmake32 + docker-c_glib: platform: linux template: docker-tests/travis.linux.yml @@ -174,6 +199,20 @@ tasks: - docker-compose build python-alpine - docker-compose run python-alpine + ###################### Documentation building tests ######################### + + docker-docs: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build docs + - docker-compose run docs + ############################## Linter tests ################################# docker-lint: @@ -214,14 +253,42 @@ tasks: ############################## Integration tests ############################ + docker-dask-integration: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build dask-integration + - docker-compose run dask-integration + docker-hdfs-integration: platform: linux template: docker-tests/travis.linux.yml params: + environment: + PYTHON_VERSION: 3.6 commands: + - docker-compose build cpp + - docker-compose build python - docker-compose build hdfs-integration - docker-compose run hdfs-integration + docker-spark-integration: + platform: 
linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build spark-integration + - docker-compose run spark-integration + docker-pandas-master: platform: linux template: docker-tests/travis.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index d6f11004233e5..b93fed74b18d1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,18 +19,31 @@ version: '3.5' +# TODO(kszucs): set arrow's mount to :ro mode, once all of the builds are +# passing without write access to the source directory. The following builds +# are contaminating the source directory: +# - docs +# - python-alpine (writes .egg directory) +# - rust (writes Cargo.lock) +# - java (without the rsync trick) + x-ubuntu-volumes: &ubuntu-volumes - .:/arrow:delegated - - ${ARROW_DOCKER_CACHE_DIR:-./docker_cache}/ubuntu:/build:delegated + - ubuntu-cache:/build:delegated x-alpine-volumes: &alpine-volumes - .:/arrow:delegated - - ${ARROW_DOCKER_CACHE_DIR:-./docker_cache}/alpine:/build:delegated + - alpine-cache:/build:delegated -services: +volumes: + ubuntu-cache: + alpine-cache: + maven-cache: + spark-cache: +services: ######################### Language Containers ############################### c_glib: @@ -57,6 +70,22 @@ services: PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data volumes: *ubuntu-volumes + cpp-cmake32: + # Usage: + # docker-compose build cpp-cmake32 + # docker-compose run cpp-cmake32 + image: arrow:cpp-cmake32 + shm_size: 2G + build: + context: . + dockerfile: cpp/Dockerfile + args: + EXTRA_CONDA_PKGS: cmake=3.2 + environment: + ARROW_ORC: "OFF" + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data + volumes: *ubuntu-volumes + cpp-alpine: # Usage: # docker-compose build cpp-alpine @@ -89,8 +118,8 @@ services: context: . dockerfile: java/Dockerfile volumes: - - .:/arrow:delegated - - $HOME/.m2:/root/.m2:delegated + - .:/arrow:ro # ensures that docker won't contaminate the host directory + - maven-cache:/root/.m2:delegated js: image: arrow:js @@ -136,6 +165,8 @@ services: build: context: . dockerfile: rust/Dockerfile + environment: + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data volumes: *ubuntu-volumes r: @@ -156,6 +187,9 @@ services: lint: # Usage: + # export PYTHON_VERSION=3.6 + # docker-compose build cpp + # docker-compose build python # docker-compose build lint # docker-compose run lint image: arrow:lint @@ -167,6 +201,9 @@ services: iwyu: # Usage: + # export PYTHON_VERSION=3.6 + # docker-compose build cpp + # docker-compose build python # docker-compose build lint # docker-compose run iwyu image: arrow:lint @@ -196,7 +233,7 @@ services: build: context: . dockerfile: docs/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes ######################### Integration Tests ################################# @@ -206,6 +243,20 @@ services: # - "21050" # hostname: impala + pandas-master: + # Usage: + # export PYTHON_VERSION=3.6 + # docker-compose build cpp + # docker-compose build python + # docker-compose build --no-cache pandas-master + # docker-compose run pandas-master + image: arrow:pandas-master + build: + context: . 
+ dockerfile: integration/pandas/Dockerfile + shm_size: 2G + volumes: *ubuntu-volumes + hdfs-namenode: image: gelog/hadoop shm_size: 2G @@ -215,7 +266,17 @@ services: command: hdfs namenode hostname: hdfs-namenode - hdfs-datanode: + hdfs-datanode-1: + image: gelog/hadoop + command: hdfs datanode + ports: + # The host port is randomly assigned by Docker, to allow scaling + # to multiple DataNodes on the same host + - "50075" + links: + - hdfs-namenode:hdfs-namenode + + hdfs-datanode-2: image: gelog/hadoop command: hdfs datanode ports: @@ -232,9 +293,11 @@ services: # docker-compose build python # docker-compose build hdfs-integration # docker-compose run hdfs-integration + image: arrow:hdfs-${HDFS_VERSION:-2.6.5} links: - hdfs-namenode:hdfs-namenode - - hdfs-datanode:hdfs-datanode + - hdfs-datanode-1:hdfs-datanode-1 + - hdfs-datanode-2:hdfs-datanode-2 environment: - ARROW_HDFS_TEST_HOST=hdfs-namenode - ARROW_HDFS_TEST_PORT=9000 @@ -242,22 +305,41 @@ services: build: context: . dockerfile: integration/hdfs/Dockerfile + args: + HDFS_VERSION: ${HDFS_VERSION:-2.6.5} + volumes: *ubuntu-volumes - pandas-master: + # TODO(kszucs): pass dask version explicitly as a build argument + dask-integration: # Usage: # export PYTHON_VERSION=3.6 # docker-compose build cpp # docker-compose build python - # docker-compose build --no-cache pandas-master - # docker-compose run pandas-master - image: arrow:pandas-master + # docker-compose build dask-integration + # docker-compose run dask-integration + image: arrow:dask-integration build: context: . - dockerfile: integration/pandas/Dockerfile - shm_size: 2G + dockerfile: integration/dask/Dockerfile volumes: *ubuntu-volumes + spark-integration: + # Usage: + # export PYTHON_VERSION=3.6 + # docker-compose build cpp + # docker-compose build python + # docker-compose build spark-integration + # docker-compose run spark-integration + image: arrow:spark-${SPARK_VERSION:-2.4.0} + environment: + - SPARK_VERSION=${SPARK_VERSION:-2.4.0} + build: + context: . + dockerfile: integration/spark/Dockerfile + volumes: + - .:/arrow:ro # ensures that docker won't contaminate the host directory + - ubuntu-cache:/build:delegated + - maven-cache:/root/.m2:delegated + - spark-cache:/spark:delegated - # TODO(kszucs): dask-integration # TODO(kszucs): hive-integration - # TODO(kszucs): spark-integration diff --git a/docs/Dockerfile b/docs/Dockerfile index 4908110b7fb56..d9441c2441868 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -18,9 +18,9 @@ FROM arrow:python-3.6 ADD ci/conda_env_sphinx.yml /arrow/ci/ -RUN conda install -c conda-forge \ - --file arrow/ci/conda_env_sphinx.yml && \ +RUN conda install --file arrow/ci/conda_env_sphinx.yml && \ conda clean --all + CMD arrow/ci/docker_build_cpp.sh && \ arrow/ci/docker_build_python.sh && \ arrow/ci/docker_build_sphinx.sh diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000000..4430d65cebb11 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,30 @@ + + +# Apache Arrow Documentation + +This directory contains source files for building the main project +documentation. This includes the [Arrow columnar format specification][2]. + +Instructions for building the documentation site are found in +[docs/source/building.rst][1]. The build depends on the API +documentation for some of the project subcomponents. 
+
+[1]: https://github.com/apache/arrow/blob/master/docs/source/building.rst
+[2]: https://github.com/apache/arrow/tree/master/docs/source/format
\ No newline at end of file
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000000000..36f2086c20b3f
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,52 @@
+@rem Licensed to the Apache Software Foundation (ASF) under one
+@rem or more contributor license agreements.  See the NOTICE file
+@rem distributed with this work for additional information
+@rem regarding copyright ownership.  The ASF licenses this file
+@rem to you under the Apache License, Version 2.0 (the
+@rem "License"); you may not use this file except in compliance
+@rem with the License.  You may obtain a copy of the License at
+@rem
+@rem   http://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing,
+@rem software distributed under the License is distributed on an
+@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+@rem KIND, either express or implied.  See the License for the
+@rem specific language governing permissions and limitations
+@rem under the License.
+
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+    set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+    echo.
+    echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+    echo.installed, then set the SPHINXBUILD environment variable to point
+    echo.to the full path of the 'sphinx-build' executable. Alternatively you
+    echo.may add the Sphinx directory to PATH.
+    echo.
+    echo.If you don't have Sphinx installed, grab it from
+    echo.http://sphinx-doc.org/
+    exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 7e33455de0e9b..77ca6574c5356 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,5 @@
 breathe
 ipython
-matplotlib
 numpydoc
 sphinx
 sphinx_rtd_theme
diff --git a/docs/source/building.rst b/docs/source/building.rst
new file mode 100644
index 0000000000000..c6ff97424fcfb
--- /dev/null
+++ b/docs/source/building.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _building-docs:
+
+Building the Documentation
+==========================
+
+Prerequisites
+-------------
+
+The documentation build process uses `Doxygen <http://www.doxygen.nl/>`_ and
+`Sphinx <http://www.sphinx-doc.org/>`_ along with a few extensions.
+
+If you're using Conda, the required software can be installed in a single line:
+
+.. code-block:: shell
+
+   conda install -c conda-forge --file=ci/conda_env_sphinx.yml
+
+Otherwise, you'll first need to install `Doxygen <http://www.doxygen.nl/>`_
+yourself (for example from your distribution's official repositories, if
+using Linux).  Then you can install the Python-based requirements with the
+following command:
+
+.. code-block:: shell
+
+   pip install -r docs/requirements.txt
+
+Building
+--------
+
+.. note::
+
+   If you are building the documentation on Windows, not all sections
+   may build properly.
+
+These two steps are mandatory and must be executed in order.
+
+#. Process the C++ API using Doxygen
+
+   .. code-block:: shell
+
+      pushd cpp/apidoc
+      doxygen
+      popd
+
+#. Build the complete documentation using Sphinx
+
+   .. code-block:: shell
+
+      pushd docs
+      make html
+      popd
+
+After these steps are completed, the documentation is rendered in HTML
+format in ``docs/_build/html``.  In particular, you can point your browser
+at ``docs/_build/html/index.html`` to read the docs and review any changes
+you made.
+
+
+.. _building-docker:
+
+Building with Docker
+--------------------
+
+You can use Docker to build the documentation:
+
+.. code-block:: shell
+
+   docker-compose build cpp
+   docker-compose build python
+   docker-compose build docs
+   docker-compose run docs
+
+The final output is located under ``docs/_build/html``.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1cadef18b64f2..d525fa943138b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -53,6 +53,7 @@
     'sphinx.ext.autodoc',
     'sphinx.ext.autosummary',
     'sphinx.ext.doctest',
+    'sphinx.ext.ifconfig',
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
     'sphinx.ext.napoleon',
@@ -69,6 +70,9 @@
     'inherited-members': None
 }
 
+# Overridden conditionally below
+autodoc_mock_imports = []
+
 # ipython directive options
 ipython_mplbackend = ''
 
@@ -387,3 +391,32 @@
 # If true, do not generate a @detailmenu in the "Top" node's menu.
 #
 # texinfo_no_detailmenu = False
+
+
+# -- Customization --------------------------------------------------------
+
+# Conditional API doc generation
+
+# Sphinx has two features for conditional inclusion:
+# - The "only" directive
+#   https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#including-content-based-on-tags
+# - The "ifconfig" extension
+#   https://www.sphinx-doc.org/en/master/usage/extensions/ifconfig.html
+#
+# Both have issues, but "ifconfig" seems to work in this setting.
+
+try:
+    import pyarrow.cuda
+    cuda_enabled = True
+except ImportError:
+    cuda_enabled = False
+    # Mock pyarrow.cuda to avoid autodoc warnings.
+    # XXX I can't get autodoc_mock_imports to work, so mock manually instead
+    # (https://github.com/sphinx-doc/sphinx/issues/2174#issuecomment-453177550)
+    from unittest import mock
+    pyarrow.cuda = sys.modules['pyarrow.cuda'] = mock.Mock()
+
+def setup(app):
+    # Use a config value to indicate whether CUDA API docs can be generated.
+    # This will also rebuild appropriately when the value changes.
+    app.add_config_value('cuda_enabled', cuda_enabled, 'env')
diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst
index 894ed1f907f6d..522609e85aacd 100644
--- a/docs/source/cpp/api.rst
+++ b/docs/source/cpp/api.rst
@@ -20,8 +20,13 @@ API Reference
 *************
 
 ..
toctree:: - :maxdepth: 2 - :caption: Getting Started + :maxdepth: 3 - api/array + api/support api/memory + api/datatype + api/array + api/builder + api/table + api/utilities + api/cuda diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst index aed18763b6ce7..bb981d1a0477d 100644 --- a/docs/source/cpp/api/array.rst +++ b/docs/source/cpp/api/array.rst @@ -15,19 +15,23 @@ .. specific language governing permissions and limitations .. under the License. -Array types -============= +====== +Arrays +====== .. doxygenclass:: arrow::Array :project: arrow_cpp :members: +Concrete array subclasses +========================= + .. doxygenclass:: arrow::DictionaryArray :project: arrow_cpp :members: -non-nested array types ----------------------- +Non-nested +---------- .. doxygenclass:: arrow::FlatArray :project: arrow_cpp @@ -65,8 +69,8 @@ non-nested array types :project: arrow_cpp :members: -nested array types ------------------- +Nested +------ .. doxygenclass:: arrow::UnionArray :project: arrow_cpp @@ -79,3 +83,10 @@ nested array types .. doxygenclass:: arrow::StructArray :project: arrow_cpp :members: + +Chunked Arrays +============== + +.. doxygenclass:: arrow::ChunkedArray + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/builder.rst b/docs/source/cpp/api/builder.rst new file mode 100644 index 0000000000000..9e6540aa557fb --- /dev/null +++ b/docs/source/cpp/api/builder.rst @@ -0,0 +1,56 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============== +Array Builders +============== + +.. doxygenclass:: arrow::ArrayBuilder + :members: + +Concrete builder subclasses +=========================== + +.. doxygenclass:: arrow::NullBuilder + :members: + +.. doxygenclass:: arrow::BooleanBuilder + :members: + +.. doxygenclass:: arrow::NumericBuilder + :members: + +.. doxygenclass:: arrow::BinaryBuilder + :members: + +.. doxygenclass:: arrow::StringBuilder + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryBuilder + :members: + +.. doxygenclass:: arrow::Decimal128Builder + :members: + +.. doxygenclass:: arrow::ListBuilder + :members: + +.. doxygenclass:: arrow::StructBuilder + :members: + +.. doxygenclass:: arrow::DictionaryBuilder + :members: diff --git a/docs/source/cpp/api/cuda.rst b/docs/source/cpp/api/cuda.rst new file mode 100644 index 0000000000000..e0b90e3a51357 --- /dev/null +++ b/docs/source/cpp/api/cuda.rst @@ -0,0 +1,69 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. 
You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============ +CUDA support +============ + +CUDA Contexts +============= + +.. doxygenclass:: arrow::cuda::CudaDeviceManager + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::cuda::CudaContext + :project: arrow_cpp + :members: + +Device and Host Buffers +======================= + +.. doxygenclass:: arrow::cuda::CudaBuffer + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::cuda::AllocateCudaHostBuffer + :project: arrow_cpp + +.. doxygenclass:: arrow::cuda::CudaHostBuffer + :project: arrow_cpp + :members: + +Device Memory Input / Output +============================ + +.. doxygenclass:: arrow::cuda::CudaBufferReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::cuda::CudaBufferWriter + :project: arrow_cpp + :members: + +CUDA IPC +======== + +.. doxygenclass:: arrow::cuda::CudaIpcMemHandle + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::cuda::SerializeRecordBatch + :project: arrow_cpp + +.. doxygenfunction:: arrow::cuda::ReadRecordBatch + :project: arrow_cpp diff --git a/docs/source/cpp/api/datatype.rst b/docs/source/cpp/api/datatype.rst new file mode 100644 index 0000000000000..adfc6e4171e66 --- /dev/null +++ b/docs/source/cpp/api/datatype.rst @@ -0,0 +1,148 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========== +Data Types +========== + +.. doxygenenum:: arrow::Type::type + +.. doxygenclass:: arrow::DataType + :members: + +.. _api-type-factories: + +Factory functions +================= + +These functions are recommended for creating data types. They may return +new objects or existing singletons, depending on the type requested. + +.. doxygengroup:: type-factories + :project: arrow_cpp + :content-only: + +Concrete type subclasses +======================== + +Primitive +--------- + +.. doxygenclass:: arrow::NullType + :members: + +.. doxygenclass:: arrow::BooleanType + :members: + +.. doxygenclass:: arrow::Int8Type + :members: + +.. doxygenclass:: arrow::Int16Type + :members: + +.. doxygenclass:: arrow::Int32Type + :members: + +.. doxygenclass:: arrow::Int64Type + :members: + +.. doxygenclass:: arrow::UInt8Type + :members: + +.. doxygenclass:: arrow::UInt16Type + :members: + +.. doxygenclass:: arrow::UInt32Type + :members: + +.. doxygenclass:: arrow::UInt64Type + :members: + +.. doxygenclass:: arrow::HalfFloatType + :members: + +.. 
doxygenclass:: arrow::FloatType + :members: + +.. doxygenclass:: arrow::DoubleType + :members: + +Time-related +------------ + +.. doxygenenum:: arrow::TimeUnit::type + +.. doxygenclass:: arrow::Date32Type + :members: + +.. doxygenclass:: arrow::Date64Type + :members: + +.. doxygenclass:: arrow::Time32Type + :members: + +.. doxygenclass:: arrow::Time64Type + :members: + +.. doxygenclass:: arrow::TimestampType + :members: + +Binary-like +----------- + +.. doxygenclass:: arrow::BinaryType + :members: + +.. doxygenclass:: arrow::StringType + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryType + :members: + +.. doxygenclass:: arrow::Decimal128Type + :members: + +Nested +------ + +.. doxygenclass:: arrow::ListType + :members: + +.. doxygenclass:: arrow::StructType + :members: + +.. doxygenclass:: arrow::UnionType + :members: + +Dictionary-encoded +------------------ + +.. doxygenclass:: arrow::DictionaryType + :members: + +Fields and Schemas +================== + +.. doxygengroup:: schema-factories + :project: arrow_cpp + :content-only: + +.. doxygenclass:: arrow::Field + :members: + +.. doxygenclass:: arrow::Schema + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index fbb5dc818628c..c921229e6cb17 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -18,8 +18,8 @@ Memory (management) =================== -Basic containers ----------------- +Buffers +------- .. doxygenclass:: arrow::Buffer :project: arrow_cpp @@ -33,16 +33,11 @@ Basic containers :project: arrow_cpp :members: -.. doxygenclass:: arrow::BufferBuilder - :project: arrow_cpp - :members: - Memory Pools ------------ .. doxygenfunction:: arrow::default_memory_pool :project: arrow_cpp - :outline: .. doxygenclass:: arrow::MemoryPool :project: arrow_cpp @@ -55,3 +50,41 @@ Memory Pools .. doxygenclass:: arrow::ProxyMemoryPool :project: arrow_cpp :members: + +Allocation Functions +-------------------- + +These functions allocate a buffer from a particular memory pool. + +.. doxygengroup:: buffer-allocation-functions + :project: arrow_cpp + :content-only: + +Slicing +------- + +.. doxygengroup:: buffer-slicing-functions + :project: arrow_cpp + :content-only: + +Buffer Builders +--------------- + +.. doxygenclass:: arrow::BufferBuilder + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TypedBufferBuilder + :project: arrow_cpp + :members: + +STL Integration +--------------- + +.. doxygenclass:: arrow::stl_allocator + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::STLMemoryPool + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/support.rst b/docs/source/cpp/api/support.rst new file mode 100644 index 0000000000000..b165a9973b4c1 --- /dev/null +++ b/docs/source/cpp/api/support.rst @@ -0,0 +1,29 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations +.. under the License. + +Programming Support +=================== + +Error return and reporting +-------------------------- + +.. doxygenclass:: arrow::Status + :project: arrow_cpp + :members: + +.. doxygendefine:: ARROW_RETURN_NOT_OK + diff --git a/docs/source/cpp/api/table.rst b/docs/source/cpp/api/table.rst new file mode 100644 index 0000000000000..e8b4f8e066e30 --- /dev/null +++ b/docs/source/cpp/api/table.rst @@ -0,0 +1,52 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======================== +Two-dimensional Datasets +======================== + +Columns +======= + +.. doxygenclass:: arrow::Column + :project: arrow_cpp + :members: + +Tables +====== + +.. doxygenclass:: arrow::Table + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::ConcatenateTables + :project: arrow_cpp + +Record Batches +============== + +.. doxygenclass:: arrow::RecordBatch + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::RecordBatchReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TableBatchReader + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/utilities.rst b/docs/source/cpp/api/utilities.rst new file mode 100644 index 0000000000000..1c18a20425c24 --- /dev/null +++ b/docs/source/cpp/api/utilities.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========= +Utilities +========= + +Decimal Numbers +=============== + +.. doxygenclass:: arrow::Decimal128 + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/arrays.rst b/docs/source/cpp/arrays.rst new file mode 100644 index 0000000000000..0c5272d2aed5e --- /dev/null +++ b/docs/source/cpp/arrays.rst @@ -0,0 +1,211 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. 
"License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +====== +Arrays +====== + +The central type in Arrow is the class :class:`arrow::Array`. An array +represents a known-length sequence of values all having the same type. +Internally, those values are represented by one or several buffers, the +number and meaning of which depend on the array's data type, as documented +in :doc:`the Arrow data layout specification <../format/Layout>`. + +Those buffers consist of the value data itself and an optional bitmap buffer +that indicates which array entries are null values. The bitmap buffer +can be entirely omitted if the array is known to have zero null values. + +There are concrete subclasses of :class:`arrow::Array` for each data type, +that help you access individual values of the array. + +Building an array +================= + +As Arrow objects are immutable, there are classes provided that help you +build these objects incrementally from third-party data. These classes +are organized in a hierarchy around the :class:`arrow::ArrayBuilder` base class, +with concrete subclasses tailored for each particular data type. + +For example, to build an array of ``int64_t`` elements, we can use the +:class:`arrow::Int64Builder` class. In the following example, we build an array +of the range 1 to 8 where the element that should hold the value 4 is nulled:: + + arrow::Int64Builder builder; + builder.Append(1); + builder.Append(2); + builder.Append(3); + builder.AppendNull(); + builder.Append(5); + builder.Append(6); + builder.Append(7); + builder.Append(8); + + std::shared_ptr array; + arrow::Status st = builder.Finish(&array); + if (!st.ok()) { + // ... do something on array building failure + } + +The resulting Array (which can be casted to the concrete :class:`arrow::Int64Array` +subclass if you want to access its values) then consists of two +:class:`arrow::Buffer`\s. +The first buffer holds the null bitmap, which consists here of a single byte with +the bits ``0|0|0|0|1|0|0|0``. As we use `least-significant bit (LSB) numbering`_. +this indicates that the fourth entry in the array is null. The second +buffer is simply an ``int64_t`` array containing all the above values. +As the fourth entry is null, the value at that position in the buffer is +undefined. + +Here is how you could access the concrete array's contents:: + + // Cast the Array to its actual type to access its data + auto int64_array = std::static_pointer_cast(array); + + // Get the pointer to the null bitmap. + const uint8_t* null_bitmap = int64_array->null_bitmap_data(); + + // Get the pointer to the actual data + const int64_t* data = int64_array->raw_values(); + + // Alternatively, given an array index, query its null bit and value directly + int64_t index = 2; + if (!int64_array->IsNull(index)) { + int64_t value = int64_array->Value(index); + } + +.. 
+   :class:`arrow::Int64Array` (respectively :class:`arrow::Int64Builder`) is
+   just a ``typedef``, provided for convenience, of ``arrow::NumericArray<Int64Type>``
+   (respectively ``arrow::NumericBuilder<Int64Type>``).
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+
+Performance
+-----------
+
+While it is possible to build an array value-by-value as in the example above,
+to attain highest performance it is recommended to use the bulk appending
+methods (usually named ``AppendValues``) in the concrete :class:`arrow::ArrayBuilder`
+subclasses.
+
+If you know the number of elements in advance, it is also recommended to
+presize the working area by calling the :func:`~arrow::ArrayBuilder::Resize`
+or :func:`~arrow::ArrayBuilder::Reserve` methods.
+
+Here is how one could rewrite the above example to take advantage of those
+APIs::
+
+   arrow::Int64Builder builder;
+   // Make place for 8 values in total
+   builder.Resize(8);
+   // Bulk append the given values (with a null in 4th place as indicated by the
+   // validity vector)
+   std::vector<bool> validity = {true, true, true, false, true, true, true, true};
+   std::vector<int64_t> values = {1, 2, 3, 0, 5, 6, 7, 8};
+   builder.AppendValues(values, validity);
+
+   std::shared_ptr<arrow::Array> array;
+   arrow::Status st = builder.Finish(&array);
+
+If you still must append values one by one, some concrete builder subclasses
+have methods marked "Unsafe" that assume the working area has been correctly
+presized, and offer higher performance in exchange::
+
+   arrow::Int64Builder builder;
+   // Make place for 8 values in total
+   builder.Resize(8);
+   builder.UnsafeAppend(1);
+   builder.UnsafeAppend(2);
+   builder.UnsafeAppend(3);
+   builder.UnsafeAppendNull();
+   builder.UnsafeAppend(5);
+   builder.UnsafeAppend(6);
+   builder.UnsafeAppend(7);
+   builder.UnsafeAppend(8);
+
+   std::shared_ptr<arrow::Array> array;
+   arrow::Status st = builder.Finish(&array);
+
+
+Size Limitations and Recommendations
+====================================
+
+Some array types are structurally limited to 32-bit sizes. This is the case
+at least for list arrays (which can hold up to 2^31 elements), and for string
+and binary arrays (which can hold up to 2GB of binary data). Some other array
+types can hold up to 2^63 elements in the C++ implementation, but other Arrow
+implementations can have a 32-bit size limitation for those array types as well.
+
+For these reasons, it is recommended that huge data be chunked in subsets of
+more reasonable size.
+
+Chunked Arrays
+==============
+
+A :class:`arrow::ChunkedArray` is, like an array, a logical sequence of values;
+but unlike a simple array, a chunked array does not require the entire sequence
+to be physically contiguous in memory. Also, the constituents of a chunked array
+need not have the same size, but they must all have the same data type.
+
+A chunked array is constructed by aggregating any number of arrays. Here we'll
+build a chunked array with the same logical values as in the example above,
+but in two separate chunks::
+
+   std::vector<std::shared_ptr<arrow::Array>> chunks;
+   std::shared_ptr<arrow::Array> array;
+
+   // Build first chunk
+   arrow::Int64Builder builder;
+   builder.Append(1);
+   builder.Append(2);
+   builder.Append(3);
+   if (!builder.Finish(&array).ok()) {
+      // ... do something on array building failure
+   }
+   chunks.push_back(std::move(array));
+
+   // Build second chunk
+   builder.Reset();
+   builder.AppendNull();
+   builder.Append(5);
+   builder.Append(6);
+   builder.Append(7);
+   builder.Append(8);
+   if (!builder.Finish(&array).ok()) {
+      // ... do something on array building failure
+   }
+   chunks.push_back(std::move(array));
+
+   auto chunked_array = std::make_shared<arrow::ChunkedArray>(std::move(chunks));
+
+   assert(chunked_array->num_chunks() == 2);
+   // Logical length in number of values
+   assert(chunked_array->length() == 8);
+   assert(chunked_array->null_count() == 1);
+
+Slicing
+=======
+
+As with physical memory buffers, it is possible to make zero-copy slices
+of arrays and chunked arrays, to obtain an array or chunked array referring
+to some logical subsequence of the data. This is done by calling the
+:func:`arrow::Array::Slice` and :func:`arrow::ChunkedArray::Slice` methods,
+respectively.
+
diff --git a/docs/source/cpp/conventions.rst b/docs/source/cpp/conventions.rst
new file mode 100644
index 0000000000000..b0424358901b4
--- /dev/null
+++ b/docs/source/cpp/conventions.rst
@@ -0,0 +1,91 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Conventions
+===========
+
+The Arrow C++ API follows a few simple guidelines. As with many rules,
+there may be exceptions.
+
+Language version
+----------------
+
+Arrow is C++11-compatible. A few backports are used for newer functionality,
+for example the :class:`std::string_view` class.
+
+Namespacing
+-----------
+
+All the Arrow API (except macros) is namespaced inside the ``arrow`` namespace,
+and nested namespaces thereof.
+
+Safe pointers
+-------------
+
+Arrow objects are usually passed and stored using safe pointers -- most of
+the time :class:`std::shared_ptr` but sometimes also :class:`std::unique_ptr`.
+
+Immutability
+------------
+
+Many Arrow objects are immutable: once constructed, their logical properties
+cannot change anymore. This makes it possible to use them in multi-threaded
+scenarios without requiring tedious and error-prone synchronization.
+
+There are obvious exceptions to this, such as IO objects or mutable data buffers.
+
+Error reporting
+---------------
+
+Most APIs indicate a successful or erroneous outcome by returning a
+:class:`arrow::Status` instance. Arrow doesn't throw exceptions of its
+own, but third-party exceptions might propagate through, especially
+:class:`std::bad_alloc` (but Arrow doesn't use the standard allocators for
+large data).
+
+As a consequence, the result value of a function is generally passed as an
+out-pointer parameter, rather than as a function return value.
+
+(However, functions which always deterministically succeed may eschew this
+convention and return their result directly.)
+
+Here is an example of checking the outcome of an operation::
+
+   const int64_t buffer_size = 4096;
+   std::shared_ptr<arrow::Buffer> buffer;
+
+   auto status = arrow::AllocateBuffer(buffer_size, &buffer);
+   if (!status.ok()) {
+      // ... handle error
+   }
+
+If the caller function itself returns a :class:`arrow::Status` and wants
+to propagate any non-successful outcomes, a convenience macro
+:cpp:func:`ARROW_RETURN_NOT_OK` is available::
+
+   arrow::Status DoSomething() {
+      const int64_t buffer_size = 4096;
+      std::shared_ptr<arrow::Buffer> buffer;
+      ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(buffer_size, &buffer));
+      // ... allocation successful, do something with buffer below
+
+      // return success at the end
+      return arrow::Status::OK();
+   }
diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst
new file mode 100644
index 0000000000000..117c05b8755e7
--- /dev/null
+++ b/docs/source/cpp/datatypes.rst
@@ -0,0 +1,65 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Data Types
+==========
+
+Data types govern how physical data is interpreted. Their :ref:`specification
+<spec-logical-types>` allows binary interoperability between different Arrow
+implementations, including from different programming languages and runtimes
+(for example it is possible to access the same data, without copying, from
+both Python and Java using the :py:mod:`pyarrow.jvm` bridge module).
+
+Information about a data type in C++ can be represented in three ways:
+
+1. Using a :class:`arrow::DataType` instance (e.g. as a function argument)
+2. Using a :class:`arrow::DataType` concrete subclass (e.g. as a template
+   parameter)
+3. Using a :type:`arrow::Type::type` enum value (e.g. as the condition of
+   a switch statement)
+
+The first form (using a :class:`arrow::DataType` instance) is the most idiomatic
+and flexible. Runtime-parametric types can only be fully represented with
+a DataType instance. For example, a :class:`arrow::TimestampType` needs to be
+constructed at runtime with a :type:`arrow::TimeUnit::type` parameter; a
+:class:`arrow::Decimal128Type` with *scale* and *precision* parameters;
+a :class:`arrow::ListType` with a full child type (itself a
+:class:`arrow::DataType` instance).
+
+The two other forms can be used where performance is critical, in order to
+avoid paying the price of dynamic typing and polymorphism. However, some
+amount of runtime switching can still be required for parametric types.
+It is not possible to reify all possible types at compile time, since Arrow
+data types allow arbitrary nesting.
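+
+As an illustration of the third form, here is a minimal sketch of dispatching
+on the :type:`arrow::Type::type` enum value (assuming ``type`` holds a
+``std::shared_ptr<arrow::DataType>`` obtained elsewhere)::
+
+   // A sketch of runtime dispatch on a type's enum id
+   switch (type->id()) {
+      case arrow::Type::INT64:
+         // ... handle 64-bit integers
+         break;
+      case arrow::Type::TIMESTAMP: {
+         // A parametric type: its parameters must still be queried at runtime
+         auto ts_type = std::static_pointer_cast<arrow::TimestampType>(type);
+         arrow::TimeUnit::type unit = ts_type->unit();
+         // ... handle timestamps of the given unit
+         break;
+      }
+      default:
+         break;
+   }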
+
+Creating data types
+-------------------
+
+To instantiate data types, it is recommended to call the provided
+:ref:`factory functions <api-type-factories>`::
+
+   std::shared_ptr<arrow::DataType> type;
+
+   // A 16-bit integer type
+   type = arrow::int16();
+   // A 64-bit timestamp type (with microsecond granularity)
+   type = arrow::timestamp(arrow::TimeUnit::MICRO);
+   // A list type of single-precision floating-point values
+   type = arrow::list(arrow::float32());
diff --git a/docs/source/cpp/examples.rst b/docs/source/cpp/examples.rst
new file mode 100644
index 0000000000000..5f4372fbba2f2
--- /dev/null
+++ b/docs/source/cpp/examples.rst
@@ -0,0 +1,30 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Examples
+========
+
+Row to columnar conversion
+--------------------------
+
+The following example converts an array of structs to a :class:`arrow::Table`
+instance, and then converts it back to the original array of structs.
+
+.. literalinclude:: ../../../cpp/examples/arrow/row-wise-conversion-example.cc
diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst
new file mode 100644
index 0000000000000..7c55b76912d1b
--- /dev/null
+++ b/docs/source/cpp/getting_started.rst
@@ -0,0 +1,31 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Getting Started
+===============
+
+.. toctree::
+
+   overview
+   conventions
+   memory
+   arrays
+   datatypes
+   tables
diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst
index 4f874bac4fd1e..1d70e6acbf0ce 100644
--- a/docs/source/cpp/index.rst
+++ b/docs/source/cpp/index.rst
@@ -20,69 +20,13 @@ C++ Implementation
 .. toctree::
    :maxdepth: 2
-   :caption: Getting Started
 
+   getting_started
+   examples
    api
 
-Getting Started
----------------
+.. TODO add "topics" chapter
+.. - nested arrays
+.. - dictionary encoding
 
-The most basic structure in Arrow is an :cpp:class:`arrow::Array`. It holds a sequence
-of values with known length all having the same type.
It consists of the data -itself and an additional bitmap that indicates if the corresponding entry of -array is a null-value. Note that for array with zero null entries, we can omit -this bitmap. - -As Arrow objects are immutable, there are classes provided that should help you -build these objects. To build an array of ``int64_t`` elements, we can use the -:cpp:class:`arrow::Int64Builder`. In the following example, we build an array of -the range 1 to 8 where the element that should hold the number 4 is nulled. - -.. code:: - - Int64Builder builder; - builder.Append(1); - builder.Append(2); - builder.Append(3); - builder.AppendNull(); - builder.Append(5); - builder.Append(6); - builder.Append(7); - builder.Append(8); - - std::shared_ptr array; - builder.Finish(&array); - -The resulting Array (which can be casted to :cpp:class:`arrow::Int64Array` if you want -to access its values) then consists of two :cpp:class:`arrow::Buffer`. The first one is -the null bitmap holding a single byte with the bits ``0|0|0|0|1|0|0|0``. -As we use `least-significant bit (LSB) numbering`_. -this indicates that the fourth entry in the array is null. The second -buffer is simply an ``int64_t`` array containing all the above values. -As the fourth entry is null, the value at that position in the buffer is -undefined. - -.. code:: - - // Cast the Array to its actual type to access its data - std::shared_ptr int64_array = std::static_pointer_cast(array); - - // Get the pointer to the null bitmap. - const uint8_t* null_bitmap = int64_array->null_bitmap_data(); - - // Get the pointer to the actual data - const int64_t* data = int64_array->raw_values(); - -In the above example, we have yet skipped explaining two things in the code. -On constructing the builder, we have passed :cpp:func:`arrow::int64()` to it. This is -the type information with which the resulting array will be annotated. In -this simple form, it is solely a :cpp:class:`std::shared_ptr` -instantiation. - -Furthermore, we have passed :cpp:func:`arrow::default_memory_pool()` to the constructor. -This :cpp:class:`arrow::MemoryPool` is used for the allocations of heap memory. Besides -tracking the amount of memory allocated, the allocator also ensures that the -allocated memory regions are 64-byte aligned (as required by the Arrow -specification). - -.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering +.. TODO add "building" or "development" chapter diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst new file mode 100644 index 0000000000000..23b4725e4b971 --- /dev/null +++ b/docs/source/cpp/memory.rst @@ -0,0 +1,127 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. 
highlight:: cpp
+
+=================
+Memory Management
+=================
+
+Buffers
+=======
+
+To avoid passing around raw data pointers with varying and non-obvious
+lifetime rules, Arrow provides a generic abstraction called :class:`arrow::Buffer`.
+A Buffer encapsulates a pointer and data size, and generally also ties its
+lifetime to that of an underlying provider (in other words, a Buffer should
+*always* point to valid memory until its destruction). Buffers are untyped:
+they simply denote a physical memory area regardless of its intended meaning
+or interpretation.
+
+Buffers may be allocated by Arrow itself, or by third-party routines.
+For example, it is possible to pass the data of a Python bytestring as an Arrow
+buffer, keeping the Python object alive as necessary.
+
+In addition, buffers come in various flavours: mutable or not, resizable or
+not. Generally, you will hold a mutable buffer when building up a piece
+of data, then it will be frozen as an immutable container such as an
+:doc:`array <arrays>`.
+
+.. note::
+   Some buffers may point to non-CPU memory, such as GPU-backed memory
+   provided by a CUDA context. If you're writing a GPU-aware application,
+   you will need to be careful not to interpret a GPU memory pointer as
+   a CPU-reachable pointer, or vice-versa.
+
+Accessing Buffer Memory
+-----------------------
+
+Buffers provide fast access to the underlying memory using the
+:func:`~arrow::Buffer::size` and :func:`~arrow::Buffer::data` accessors
+(or :func:`~arrow::Buffer::mutable_data` for writable access to a mutable
+buffer).
+
+Slicing
+-------
+
+It is possible to make zero-copy slices of buffers, to obtain a buffer
+referring to some contiguous subset of the underlying data. This is done
+by calling the :func:`arrow::SliceBuffer` and :func:`arrow::SliceMutableBuffer`
+functions.
+
+Allocating a Buffer
+-------------------
+
+You can allocate a buffer yourself by calling one of the
+:func:`arrow::AllocateBuffer` or :func:`arrow::AllocateResizableBuffer`
+overloads::
+
+   std::shared_ptr<arrow::Buffer> buffer;
+
+   if (!arrow::AllocateBuffer(4096, &buffer).ok()) {
+      // ... handle allocation error
+   }
+   uint8_t* buffer_data = buffer->mutable_data();
+   memcpy(buffer_data, "hello world", 11);
+
+Allocating a buffer this way ensures it is 64-byte aligned and padded
+as recommended by the :doc:`Arrow memory specification <../format/Layout>`.
+
+Building a Buffer
+-----------------
+
+You can also allocate *and* build a Buffer incrementally, using the
+:class:`arrow::BufferBuilder` API::
+
+   arrow::BufferBuilder builder;
+   builder.Resize(11);
+   builder.Append("hello ", 6);
+   builder.Append("world", 5);
+
+   std::shared_ptr<arrow::Buffer> buffer;
+   if (!builder.Finish(&buffer).ok()) {
+      // ... handle buffer allocation error
+   }
+
+Memory Pools
+============
+
+When allocating a Buffer using the Arrow C++ API, the buffer's underlying
+memory is allocated by a :class:`arrow::MemoryPool` instance. Usually this
+will be the process-wide *default memory pool*, but many Arrow APIs allow
+you to pass another MemoryPool instance for their internal allocations.
+
+Memory pools are used for large long-lived data such as array buffers.
+Other data, such as small C++ objects and temporary workspaces, usually
+goes through the regular C++ allocators.
+
+Default Memory Pool
+-------------------
+
+Depending on how Arrow was compiled, the default memory pool may use the
+standard C ``malloc`` allocator, or a `jemalloc <http://jemalloc.net/>`_ heap.
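+
+Whatever the backing allocator, a pool tracks its allocations. As a minimal
+sketch (assuming the overload of :func:`arrow::AllocateBuffer` taking an
+explicit pool), one can pass the default pool to an allocation and inspect
+its statistics afterwards::
+
+   arrow::MemoryPool* pool = arrow::default_memory_pool();
+
+   std::shared_ptr<arrow::Buffer> buffer;
+   if (!arrow::AllocateBuffer(pool, 4096, &buffer).ok()) {
+      // ... handle allocation error
+   }
+   // The pool keeps track of the memory currently allocated through it
+   int64_t allocated = pool->bytes_allocated();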
+
+STL Integration
+---------------
+
+If you wish to use an Arrow memory pool to allocate the data of STL containers,
+you can do so using the :class:`arrow::stl_allocator` wrapper.
+
+Conversely, you can also use an STL allocator to allocate Arrow memory,
+using the :class:`arrow::STLMemoryPool` class. However, this may be less
+performant, as STL allocators don't provide a resizing operation.
diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst
new file mode 100644
index 0000000000000..490efc1b7a2c1
--- /dev/null
+++ b/docs/source/cpp/overview.rst
@@ -0,0 +1,93 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+High-Level Overview
+===================
+
+The Arrow C++ library is made up of different parts, each of which serves
+a specific purpose.
+
+The physical layer
+------------------
+
+**Memory management** abstractions provide a uniform API over memory that
+may be allocated through various means, such as heap allocation, the memory
+mapping of a file or a static memory area. In particular, the **buffer**
+abstraction represents a contiguous area of physical data.
+
+The one-dimensional layer
+-------------------------
+
+**Data types** govern the *logical* interpretation of *physical* data.
+Many operations in Arrow are parameterized, at compile-time or at runtime,
+by a data type.
+
+**Arrays** assemble one or several buffers with a data type, allowing them
+to be viewed as a logical contiguous sequence of values (possibly nested).
+
+**Chunked arrays** are a generalization of arrays, combining several same-type
+arrays into a longer logical sequence of values.
+
+The two-dimensional layer
+-------------------------
+
+**Schemas** describe a logical collection of several pieces of data,
+each with a distinct name and type, and optional metadata.
+
+**Columns** are like chunked arrays, but with optional metadata.
+
+**Tables** are collections of columns in accordance with a schema. They are
+the most capable dataset-providing abstraction in Arrow.
+
+**Record batches** are collections of contiguous arrays, described
+by a schema. They allow incremental construction or serialization of tables.
+
+The compute layer
+-----------------
+
+**Datums** are flexible dataset references, able to hold, for example, an
+array or table reference.
+
+**Kernels** are specialized computation functions running in a loop over a
+given set of datums representing input and output parameters to the functions.
+
+The IO layer
+------------
+
+**Streams** allow untyped sequential or seekable access over external data
+of various kinds (for example compressed or memory-mapped).
+
+The Inter-Process Communication (IPC) layer
+-------------------------------------------
+
+A **messaging format** allows interchange of Arrow data between processes, using
+as few copies as possible.
+
+The file formats layer
+----------------------
+
+Reading and writing Arrow data from/to various file formats is possible, for
+example **Parquet**, **CSV**, **ORC** or the Arrow-specific **Feather** format.
+
+The devices layer
+-----------------
+
+Basic **CUDA** integration is provided, making it possible to describe Arrow
+data backed by GPU-allocated memory.
diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst
new file mode 100644
index 0000000000000..d42f0c6c4f53e
--- /dev/null
+++ b/docs/source/cpp/tables.rst
@@ -0,0 +1,87 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+========================
+Two-dimensional Datasets
+========================
+
+While arrays and chunked arrays represent a one-dimensional sequence of
+homogeneous values, data often comes in the form of two-dimensional sets of
+heterogeneous data (such as database tables, CSV files...). Arrow provides
+several abstractions to handle such data conveniently and efficiently.
+
+Fields
+======
+
+Fields are used to denote the particular columns of a table (and also
+the particular members of a nested data type such as :class:`arrow::StructType`).
+A field, i.e. an instance of :class:`arrow::Field`, holds together a data
+type, a field name and some optional metadata.
+
+The recommended way to create a field is to call the :func:`arrow::field`
+factory function.
+
+Schemas
+=======
+
+A schema describes the overall structure of a two-dimensional dataset such
+as a table. It holds a sequence of fields together with some optional
+schema-wide metadata (in addition to per-field metadata). The recommended
+way to create a schema is to call one of the :func:`arrow::schema` factory
+function overloads::
+
+   // Create a schema describing datasets with two columns:
+   // an int32 column "A" and a utf8-encoded string column "B"
+   std::shared_ptr<arrow::Field> field_a, field_b;
+   std::shared_ptr<arrow::Schema> schema;
+
+   field_a = arrow::field("A", arrow::int32());
+   field_b = arrow::field("B", arrow::utf8());
+   schema = arrow::schema({field_a, field_b});
+
+Columns
+=======
+
+A :class:`arrow::Column` is a chunked array tied together with a field.
+The field describes the column's name (for lookup in a larger dataset)
+and its metadata.
+
+Tables
+======
+
+A :class:`arrow::Table` is a two-dimensional dataset of a number of columns,
+together with a schema. The columns' names and types must match the schema.
+Also, each column must have the same logical length in number of elements
+(although each column can be chunked in a different way).
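+
+As a minimal sketch, a table could be assembled from the schema above and
+one array per column (``array_a`` and ``array_b`` are hypothetical,
+previously built arrays whose types and lengths match the schema)::
+
+   std::vector<std::shared_ptr<arrow::Array>> table_columns = {array_a, array_b};
+   auto table = arrow::Table::Make(schema, table_columns);
+
+   // Each array is wrapped into a single-chunk column matching the schema
+   assert(table->num_columns() == 2);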
+
+Record Batches
+==============
+
+A :class:`arrow::RecordBatch` is a two-dimensional dataset of a number of
+contiguous arrays, each of the same length. Like a table, a record batch also
+has a schema which must match its arrays' data types.
+
+Record batches are a convenient unit of work for various serialization
+and computation functions, possibly incremental.
+
+A table can be streamed as an arbitrary number of record batches using
+a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of
+record batches can be assembled to form a table using one of the
+:func:`arrow::Table::FromRecordBatches` factory function overloads.
diff --git a/docs/source/format/IPC.rst b/docs/source/format/IPC.rst
index 8cb74b87afcdc..62a1237436ae3 100644
--- a/docs/source/format/IPC.rst
+++ b/docs/source/format/IPC.rst
@@ -234,4 +234,28 @@ region) to be multiples of 64 bytes: ::
 
+SparseTensor Message Format
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``SparseTensor`` message type provides another way to write a
+multidimensional array of fixed-size values using Arrow's shared memory tools
+in addition to ``Tensor``. ``SparseTensor`` is designed specifically for tensors
+whose elements are mostly zeros. As with ``Tensor``, Arrow implementations in
+general are not required to implement this data format.
+
+When writing a standalone encapsulated sparse tensor message, we use the format as
+indicated above, but additionally align the starting offset of the metadata as
+well as the starting offsets of the sparse index and the sparse tensor body
+(if writing to a shared memory region) to be multiples of 64 bytes: ::
+
+    <PADDING>
+    <metadata size: int32>
+    <metadata>
+    <PADDING>
+    <sparse index>
+    <PADDING>
+    <sparse tensor body>
+
+The contents of the sparse tensor index depend on which sparse format
+is used.
+
 .. _Flatbuffer: https://github.com/google/flatbuffers
diff --git a/docs/source/format/Layout.rst b/docs/source/format/Layout.rst
index 868a99b34f8d0..efe0594803a40 100644
--- a/docs/source/format/Layout.rst
+++ b/docs/source/format/Layout.rst
@@ -64,9 +64,11 @@ Base requirements
   data
 * It is required to have all the contiguous memory buffers in an IPC payload
   aligned at 8-byte boundaries. In other words, each buffer must start at
-  an aligned 8-byte offset.
-* The general recommendation is to align the buffers at 64-byte boundary, but
-  this is not absolutely necessary.
+  an aligned 8-byte offset. Additionally, each buffer should be padded to a multiple
+  of 8 bytes.
+* For performance reasons it is **recommended** to align buffers to a
+  64-byte boundary and pad to a multiple of 64 bytes, but this is not absolutely
+  necessary. The rationale is discussed in more detail below.
 * Any relative type can have null slots
 * Arrays are immutable once created. Implementations can provide APIs to mutate
   an array, but applying mutations will require a new array data structure to
@@ -122,14 +124,16 @@ practices for optimized memory access:
 * Elements in numeric arrays will be guaranteed to be retrieved via aligned access.
 * On some architectures alignment can help limit partially used cache lines.
-* 64 byte alignment is recommended by the `Intel performance guide`_ for
-  data-structures over 64 bytes (which will be a common case for Arrow Arrays).
-Recommending padding to a multiple of 64 bytes allows for using `SIMD`_ instructions
+The recommendation for 64 byte alignment comes from the `Intel performance guide`_,
+which recommends alignment of memory to match SIMD register width.
+The specific padding length was chosen because it matches the largest known +SIMD instruction registers available as of April 2016 (Intel AVX-512). + +The recommended padding of 64 bytes allows for using `SIMD`_ instructions consistently in loops without additional conditional checks. This should allow for simpler, efficient and CPU cache-friendly code. -The specific padding length was chosen because it matches the largest known -SIMD instruction registers available as of April 2016 (Intel AVX-512). In other +In other words, we can load the entire 64-byte buffer into a 512-bit wide SIMD register and get data-level parallelism on all the columnar values packed into the 64-byte buffer. Guaranteed padding can also allow certain compilers @@ -162,9 +166,8 @@ Null bitmaps Any relative type can have null value slots, whether primitive or nested type. An array with nulls must have a contiguous memory buffer, known as the null (or -validity) bitmap, whose length is a multiple of 64 bytes (as discussed above) -and large enough to have at least 1 bit for each array -slot. +validity) bitmap, whose length is a multiple of 8 bytes (64 bytes recommended) +and large enough to have at least 1 bit for each array slot. Whether any array slot is valid (non-null) is encoded in the respective bits of this bitmap. A 1 (set bit) for index ``j`` indicates that the value is not null, @@ -614,13 +617,13 @@ Dictionary encoding ------------------- When a field is dictionary encoded, the values are represented by an array of -Int32 representing the index of the value in the dictionary. The Dictionary is -received as one or more DictionaryBatches with the id referenced by a -dictionary attribute defined in the metadata (Message.fbs) in the Field -table. The dictionary has the same layout as the type of the field would -dictate. Each entry in the dictionary can be accessed by its index in the -DictionaryBatches. When a Schema references a Dictionary id, it must send at -least one DictionaryBatch for this id. +signed integers representing the index of the value in the dictionary. +The Dictionary is received as one or more DictionaryBatches with the id +referenced by a dictionary attribute defined in the metadata (Message.fbs) +in the Field table. The dictionary has the same layout as the type of the +field would dictate. Each entry in the dictionary can be accessed by its +index in the DictionaryBatches. When a Schema references a Dictionary id, +it must send at least one DictionaryBatch for this id. As an example, you could have the following data: :: @@ -640,16 +643,17 @@ As an example, you could have the following data: :: In dictionary-encoded form, this could appear as: :: data List (dictionary-encoded, dictionary id i) - indices: [0, 0, 0, 1, 1, 1, 0] + type: Int32 + values: + [0, 0, 0, 1, 1, 1, 0] dictionary i - - type: List - - [ - ['a', 'b'], - ['c', 'd', 'e'], - ] + type: List + values: + [ + ['a', 'b'], + ['c', 'd', 'e'], + ] References ---------- @@ -659,6 +663,6 @@ Apache Drill Documentation - `Value Vectors`_ .. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering .. _Intel performance guide: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors .. _Endianness: https://en.wikipedia.org/wiki/Endianness -.. _SIMD: https://software.intel.com/en-us/node/600110 +.. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates .. 
_Parquet: https://parquet.apache.org/documentation/latest/ .. _Value Vectors: https://drill.apache.org/docs/value-vectors/ diff --git a/docs/source/format/Metadata.rst b/docs/source/format/Metadata.rst index 4ed82e0078e2c..293d0113875a6 100644 --- a/docs/source/format/Metadata.rst +++ b/docs/source/format/Metadata.rst @@ -266,6 +266,8 @@ detail for each type below): :: buffer 10: field 5 offsets buffer 11: field 5 data +.. _spec-logical-types: + Logical types ------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index fa6c683d14ecb..2b367b33823a2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -40,3 +40,9 @@ messaging and interprocess communication. cpp/index python/index + +.. toctree:: + :maxdepth: 2 + :caption: Other Topics + + building diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 06863964978b3..b06509f7a5b19 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -15,373 +15,22 @@ .. specific language governing permissions and limitations .. under the License. -.. currentmodule:: pyarrow .. _api: ************* API Reference ************* -.. _api.types: - -Type and Schema Factory Functions ---------------------------------- - -.. autosummary:: - :toctree: generated/ - - null - bool_ - int8 - int16 - int32 - int64 - uint8 - uint16 - uint32 - uint64 - float16 - float32 - float64 - time32 - time64 - timestamp - date32 - date64 - binary - string - decimal128 - list_ - struct - dictionary - field - schema - from_numpy_dtype - -.. currentmodule:: pyarrow.types -.. _api.types.checking: - -Type checking functions ------------------------ - -.. autosummary:: - :toctree: generated/ - - is_boolean - is_integer - is_signed_integer - is_unsigned_integer - is_int8 - is_int16 - is_int32 - is_int64 - is_uint8 - is_uint16 - is_uint32 - is_uint64 - is_floating - is_float16 - is_float32 - is_float64 - is_decimal - is_list - is_struct - is_union - is_nested - is_temporal - is_timestamp - is_date - is_date32 - is_date64 - is_time - is_time32 - is_time64 - is_null - is_binary - is_unicode - is_string - is_fixed_size_binary - is_map - is_dictionary - -.. currentmodule:: pyarrow - -.. _api.value: - -Scalar Value Types ------------------- - -.. autosummary:: - :toctree: generated/ - - NA - Scalar - ArrayValue - BooleanValue - Int8Value - Int16Value - Int32Value - Int64Value - UInt8Value - UInt16Value - UInt32Value - UInt64Value - FloatValue - DoubleValue - ListValue - BinaryValue - StringValue - FixedSizeBinaryValue - Date32Value - Date64Value - TimestampValue - DecimalValue - -.. _api.array: - -.. currentmodule:: pyarrow - -Array Types ------------ - -.. autosummary:: - :toctree: generated/ - - array - Array - BooleanArray - DictionaryArray - FloatingPointArray - IntegerArray - Int8Array - Int16Array - Int32Array - Int64Array - NullArray - NumericArray - UInt8Array - UInt16Array - UInt32Array - UInt64Array - BinaryArray - FixedSizeBinaryArray - StringArray - Time32Array - Time64Array - Date32Array - Date64Array - TimestampArray - Decimal128Array - ListArray - -.. _api.table: - -.. currentmodule:: pyarrow - -Tables and Record Batches -------------------------- - -.. autosummary:: - :toctree: generated/ - - column - chunked_array - concat_tables - ChunkedArray - Column - RecordBatch - Table - -.. _api.tensor: - -Tensor type and Functions -------------------------- - -.. autosummary:: - :toctree: generated/ - - Tensor - -.. _api.io: - -In-Memory Buffers ------------------ - -.. 
autosummary:: - :toctree: generated/ - - allocate_buffer - compress - decompress - py_buffer - foreign_buffer - Buffer - ResizableBuffer - -Input / Output and Shared Memory --------------------------------- - -.. autosummary:: - :toctree: generated/ - - input_stream - output_stream - BufferReader - BufferOutputStream - FixedSizeBufferWriter - NativeFile - OSFile - MemoryMappedFile - CompressedInputStream - CompressedOutputStream - memory_map - create_memory_map - PythonFile - -File Systems ------------- - -.. autosummary:: - :toctree: generated/ - - hdfs.connect - LocalFileSystem - -.. class:: HadoopFileSystem - :noindex: - -.. _api.ipc: - -Serialization and IPC ---------------------- - -.. autosummary:: - :toctree: generated/ - - Message - MessageReader - RecordBatchFileReader - RecordBatchFileWriter - RecordBatchStreamReader - RecordBatchStreamWriter - open_file - open_stream - read_message - read_record_batch - get_record_batch_size - read_tensor - write_tensor - get_tensor_size - serialize - serialize_to - deserialize - deserialize_components - deserialize_from - read_serialized - SerializedPyObject - SerializationContext - -.. _api.feather: - -Feather Format -~~~~~~~~~~~~~~ - -.. currentmodule:: pyarrow.feather - -.. _api.memory_pool: - -.. autosummary:: - :toctree: generated/ - - read_feather - write_feather - -Memory Pools ------------- - -.. currentmodule:: pyarrow - -.. autosummary:: - :toctree: generated/ - - MemoryPool - default_memory_pool - total_allocated_bytes - set_memory_pool - log_memory_allocations - -.. _api.type_classes: - -.. currentmodule:: pyarrow - -Type Classes ------------- - -.. autosummary:: - :toctree: generated/ - - DataType - Field - Schema - -.. currentmodule:: pyarrow.plasma - -.. _api.plasma: - -In-Memory Object Store ----------------------- - -.. autosummary:: - :toctree: generated/ - - ObjectID - PlasmaClient - PlasmaBuffer - -.. currentmodule:: pyarrow.csv - -.. _api.csv: - -CSV Files ---------- - -.. autosummary:: - :toctree: generated/ - - ReadOptions - ParseOptions - ConvertOptions - read_csv - -.. currentmodule:: pyarrow.parquet - -.. _api.parquet: - -Apache Parquet --------------- - -.. autosummary:: - :toctree: generated/ - - ParquetDataset - ParquetFile - ParquetWriter - read_table - read_metadata - read_pandas - read_schema - write_metadata - write_table - write_to_dataset - -.. currentmodule:: pyarrow - -Using with C extensions ------------------------ - -.. autosummary:: - :toctree: generated/ - - get_include - get_libraries - get_library_dirs +.. toctree:: + :maxdepth: 2 + + api/datatypes + api/arrays + api/memory + api/files + api/tables + api/ipc + api/formats + api/plasma + api/cuda + api/misc diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst new file mode 100644 index 0000000000000..db45eeff0ca5a --- /dev/null +++ b/docs/source/python/api/arrays.rst @@ -0,0 +1,109 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.array: +.. currentmodule:: pyarrow + +Arrays and Scalars +================== + +Factory Function +---------------- + +This function is the main entry point to create an Arrow array from Python. + +.. autosummary:: + :toctree: ../generated/ + + array + +Array Types +----------- + +An array's Python class depends on its data type. Concrete array classes +may expose data type-specific methods or properties. + +.. autosummary:: + :toctree: ../generated/ + + Array + BooleanArray + FloatingPointArray + IntegerArray + Int8Array + Int16Array + Int32Array + Int64Array + NullArray + NumericArray + UInt8Array + UInt16Array + UInt32Array + UInt64Array + BinaryArray + StringArray + FixedSizeBinaryArray + Time32Array + Time64Array + Date32Array + Date64Array + TimestampArray + Decimal128Array + DictionaryArray + ListArray + StructArray + UnionArray + +.. _api.scalar: + +Array Scalars +------------- + +Indexing an array wraps the represented value in a scalar object whose +concrete type depends on the array data type. You shouldn't instantiate +any of those classes directly. + +.. autosummary:: + :toctree: ../generated/ + + NA + Scalar + ArrayValue + BooleanValue + Int8Value + Int16Value + Int32Value + Int64Value + UInt8Value + UInt16Value + UInt32Value + UInt64Value + FloatValue + DoubleValue + BinaryValue + StringValue + FixedSizeBinaryValue + Time32Value + Time64Value + Date32Value + Date64Value + TimestampValue + DecimalValue + DictionaryValue + ListValue + StructValue + UnionValue diff --git a/docs/source/python/api/cuda.rst b/docs/source/python/api/cuda.rst new file mode 100644 index 0000000000000..364f032403586 --- /dev/null +++ b/docs/source/python/api/cuda.rst @@ -0,0 +1,62 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.cuda + +CUDA Integration +================ + +.. ifconfig:: not cuda_enabled + + .. error:: + This documentation was built without CUDA enabled. The CUDA + API docs are not available. + +.. NOTE We still generate those API docs (with empty docstrings) +.. when CUDA is disabled and `pyarrow.cuda` mocked (see conf.py). +.. Otherwise we'd get autodoc warnings, see https://github.com/sphinx-doc/sphinx/issues/4770 + +CUDA Contexts +------------- + +.. autosummary:: + :toctree: ../generated/ + + Context + +CUDA Buffers +------------ + +.. autosummary:: + :toctree: ../generated/ + + CudaBuffer + new_host_buffer + HostBuffer + BufferReader + BufferWriter + +Serialization and IPC +--------------------- + +.. 
autosummary:: + :toctree: ../generated/ + + serialize_record_batch + read_record_batch + read_message + IpcMemHandle diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst new file mode 100644 index 0000000000000..5ad0204966337 --- /dev/null +++ b/docs/source/python/api/datatypes.rst @@ -0,0 +1,134 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.types: +.. currentmodule:: pyarrow + +Data Types and Schemas +====================== + +Factory Functions +----------------- + +These should be used to create Arrow data types and schemas. + +.. autosummary:: + :toctree: ../generated/ + + null + bool_ + int8 + int16 + int32 + int64 + uint8 + uint16 + uint32 + uint64 + float16 + float32 + float64 + time32 + time64 + timestamp + date32 + date64 + binary + string + utf8 + decimal128 + list_ + struct + dictionary + field + schema + from_numpy_dtype + +.. _api.type_classes: +.. currentmodule:: pyarrow + +Type Classes +------------ + +Do not instantiate these classes directly. Instead, call one of the factory +functions above. + +.. autosummary:: + :toctree: ../generated/ + + DataType + DictionaryType + ListType + StructType + UnionType + TimestampType + Time32Type + Time64Type + FixedSizeBinaryType + Decimal128Type + Field + Schema + +.. _api.types.checking: +.. currentmodule:: pyarrow.types + +Type Checking +------------- + +These functions are predicates to check whether a :class:`DataType` instance +represents a given data type (such as ``int32``) or general category +(such as "is a signed integer"). + +.. autosummary:: + :toctree: ../generated/ + + is_boolean + is_integer + is_signed_integer + is_unsigned_integer + is_int8 + is_int16 + is_int32 + is_int64 + is_uint8 + is_uint16 + is_uint32 + is_uint64 + is_floating + is_float16 + is_float32 + is_float64 + is_decimal + is_list + is_struct + is_union + is_nested + is_temporal + is_timestamp + is_date + is_date32 + is_date64 + is_time + is_time32 + is_time64 + is_null + is_binary + is_unicode + is_string + is_fixed_size_binary + is_map + is_dictionary diff --git a/docs/source/python/api/files.rst b/docs/source/python/api/files.rst new file mode 100644 index 0000000000000..106dfde8abffb --- /dev/null +++ b/docs/source/python/api/files.rst @@ -0,0 +1,65 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+Streams and File Access
+=======================
+
+.. _api.io:
+
+Factory Functions
+-----------------
+
+These factory functions are the recommended way to create an Arrow stream.
+They accept various kinds of sources, such as in-memory buffers or on-disk files.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   input_stream
+   output_stream
+   memory_map
+   create_memory_map
+
+Stream Classes
+--------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   NativeFile
+   OSFile
+   PythonFile
+   BufferReader
+   BufferOutputStream
+   FixedSizeBufferWriter
+   MemoryMappedFile
+   CompressedInputStream
+   CompressedOutputStream
+
+File Systems
+------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   hdfs.connect
+   LocalFileSystem
+
+.. class:: HadoopFileSystem
+   :noindex:
diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst
new file mode 100644
index 0000000000000..8de30ece93584
--- /dev/null
+++ b/docs/source/python/api/formats.rst
@@ -0,0 +1,70 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Tabular File Formats
+====================
+
+.. currentmodule:: pyarrow.csv
+
+.. _api.csv:
+
+CSV Files
+---------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   ReadOptions
+   ParseOptions
+   ConvertOptions
+   read_csv
+
+.. _api.feather:
+
+Feather Files
+-------------
+
+.. currentmodule:: pyarrow.feather
+
+.. autosummary::
+   :toctree: ../generated/
+
+   read_feather
+   write_feather
+
+.. currentmodule:: pyarrow
+
+.. _api.parquet:
+
+Parquet Files
+-------------
+
+.. currentmodule:: pyarrow.parquet
+
+.. autosummary::
+   :toctree: ../generated/
+
+   ParquetDataset
+   ParquetFile
+   ParquetWriter
+   read_table
+   read_metadata
+   read_pandas
+   read_schema
+   write_metadata
+   write_table
+   write_to_dataset
diff --git a/docs/source/python/api/ipc.rst b/docs/source/python/api/ipc.rst
new file mode 100644
index 0000000000000..bd14d30dcb274
--- /dev/null
+++ b/docs/source/python/api/ipc.rst
@@ -0,0 +1,59 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+..
Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.ipc: + +Serialization and IPC +===================== + +Inter-Process Communication +--------------------------- + +.. autosummary:: + :toctree: ../generated/ + + ipc.open_file + ipc.open_stream + Message + MessageReader + RecordBatchFileReader + RecordBatchFileWriter + RecordBatchStreamReader + RecordBatchStreamWriter + read_message + read_record_batch + get_record_batch_size + read_tensor + write_tensor + get_tensor_size + +Serialization +------------- + +.. autosummary:: + :toctree: ../generated/ + + serialize + serialize_to + deserialize + deserialize_components + deserialize_from + read_serialized + SerializedPyObject + SerializationContext diff --git a/docs/source/python/api/memory.rst b/docs/source/python/api/memory.rst new file mode 100644 index 0000000000000..da9156fcad539 --- /dev/null +++ b/docs/source/python/api/memory.rst @@ -0,0 +1,68 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.memory: + +Buffers and Memory +================== + +In-Memory Buffers +----------------- + +Factory Functions +~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + allocate_buffer + py_buffer + foreign_buffer + +Classes +~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + Buffer + ResizableBuffer + +Miscellaneous +~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + compress + decompress + +.. _api.memory_pool: + +Memory Pools +------------ + +.. autosummary:: + :toctree: ../generated/ + + MemoryPool + default_memory_pool + total_allocated_bytes + set_memory_pool + log_memory_allocations diff --git a/docs/source/python/api/misc.rst b/docs/source/python/api/misc.rst new file mode 100644 index 0000000000000..c13b80620f154 --- /dev/null +++ b/docs/source/python/api/misc.rst @@ -0,0 +1,40 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +Miscellaneous +============= + +Multi-Threading +--------------- + +.. autosummary:: + :toctree: ../generated/ + + cpu_count + set_cpu_count + +Using with C extensions +----------------------- + +.. autosummary:: + :toctree: ../generated/ + + get_include + get_libraries + get_library_dirs diff --git a/docs/source/python/api/plasma.rst b/docs/source/python/api/plasma.rst new file mode 100644 index 0000000000000..8df9e4e21ac8b --- /dev/null +++ b/docs/source/python/api/plasma.rst @@ -0,0 +1,33 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.plasma + +.. _api.plasma: + +Plasma In-Memory Object Store +============================= + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + ObjectID + PlasmaClient + PlasmaBuffer diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst new file mode 100644 index 0000000000000..5a229d29fa60b --- /dev/null +++ b/docs/source/python/api/tables.rst @@ -0,0 +1,54 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.table: + +Tables and Tensors +================== + +Factory Functions +----------------- + +.. autosummary:: + :toctree: ../generated/ + + column + chunked_array + concat_tables + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + ChunkedArray + Column + RecordBatch + Table + +.. _api.tensor: + +Tensors +------- + +.. autosummary:: + :toctree: ../generated/ + + Tensor diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst new file mode 100644 index 0000000000000..12205c57355bb --- /dev/null +++ b/docs/source/python/benchmarks.rst @@ -0,0 +1,55 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. 
distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Benchmarks +========== + +The ``pyarrow`` package comes with a suite of benchmarks meant to +run with `ASV`_. You'll need to install the ``asv`` package first +(``pip install asv`` or ``conda install -c conda-forge asv``). + +Running the benchmarks +---------------------- + +To run the benchmarks for a locally-built Arrow, run ``asv dev`` or +``asv run --python=same``. + +Running for arbitrary Git revisions +----------------------------------- + +ASV allows you to store results and generate graphs of the benchmarks over +the project's evolution. You need to have the latest development version of ASV: + +.. code:: + + pip install git+https://github.com/airspeed-velocity/asv + +The build scripts assume that Conda's ``activate`` script is on the PATH +(the ``conda activate`` command unfortunately isn't available from +non-interactive scripts). + +Now you should be ready to run ``asv run`` or whatever other command +suits your needs. Note that this can take quite a while, as Arrow needs +to be rebuilt for each Git revision you benchmark.
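+ +For example, the following commands are one possible workflow (a sketch: +``HEAD^!`` is ordinary Git range syntax for "just the latest commit", and +``asv publish``/``asv preview`` are standard ASV subcommands for rendering +and serving the HTML report): + +.. code:: + + asv run HEAD^! + asv publish + asv preview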
+ +Compatibility +------------- + +We only expect the benchmarking setup to work with Python 3.6 or later, +on a Unix-like system with bash. + +.. _asv: https://asv.readthedocs.org/ diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index f1bcea9e24795..17023b1610d48 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -86,3 +86,7 @@ overhead of reading CSV files. Performance options can be controlled through the :class:`ReadOptions` class. Multi-threaded reading is the default for highest performance, distributing the workload efficiently over all available cores. + +.. note:: + The number of threads to use concurrently is automatically inferred by Arrow + and can be inspected using the :func:`~pyarrow.cpu_count` function. diff --git a/docs/source/python/cuda.rst b/docs/source/python/cuda.rst new file mode 100644 index 0000000000000..b0150c1c5c8a2 --- /dev/null +++ b/docs/source/python/cuda.rst @@ -0,0 +1,159 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.cuda + +CUDA Integration +================ + +Arrow is not limited to CPU buffers (located in the computer's main memory, +also named "host memory"). It also has provisions for accessing buffers +located on a CUDA-capable GPU device (in "device memory"). + +.. note:: + This functionality is optional and must have been enabled at build time. + If this is not done by your package manager, you might have to build Arrow + yourself. + +CUDA Contexts +------------- + +A CUDA context represents access to a particular CUDA-capable device. +For example, this is creating a CUDA context accessing CUDA device number 0:: + + >>> from pyarrow import cuda + >>> ctx = cuda.Context(0) + >>> +CUDA Buffers +------------ + +A CUDA buffer can be created by copying data from host memory to the memory +of a CUDA device, using the :meth:`Context.buffer_from_data` method. +The source data can be any Python buffer-like object, including Arrow buffers:: + + >>> import numpy as np + >>> arr = np.arange(4, dtype=np.int32) + >>> arr.nbytes + 16 + >>> cuda_buf = ctx.buffer_from_data(arr) + >>> type(cuda_buf) + pyarrow._cuda.CudaBuffer + >>> cuda_buf.size # The buffer's size in bytes + 16 + >>> cuda_buf.address # The buffer's address in device memory + 30088364544 + >>> cuda_buf.context.device_number + 0 + +Conversely, you can copy a CUDA buffer back to host memory, getting a regular +CPU buffer:: + + >>> buf = cuda_buf.copy_to_host() + >>> type(buf) + pyarrow.lib.Buffer + >>> np.frombuffer(buf, dtype=np.int32) + array([0, 1, 2, 3], dtype=int32) + +.. warning:: + Many Arrow functions expect a CPU buffer but will not check the buffer's + actual type. You will get a crash if you pass a CUDA buffer to such a + function:: + + >>> pa.py_buffer(b"x" * 16).equals(cuda_buf) + Segmentation fault + +Numba Integration +----------------- + +There is not much you can do directly with Arrow CUDA buffers from Python, +but they support interoperation with `Numba <https://numba.pydata.org/>`_, +a JIT compiler which can turn Python code into optimized CUDA kernels. + +Arrow to Numba +~~~~~~~~~~~~~~ + +First let's define a Numba CUDA kernel operating on an ``int32`` array. Here, +we will simply increment each array element (assuming the array is writable):: + + import numba.cuda + + @numba.cuda.jit + def increment_by_one(an_array): + pos = numba.cuda.grid(1) + if pos < an_array.size: + an_array[pos] += 1 + +Then we need to wrap our CUDA buffer into a Numba "device array" with the right +array metadata (shape, strides and datatype). This is necessary so that Numba +can identify the array's characteristics and compile the kernel with the +appropriate type declarations. + +In this case the metadata can simply be obtained from the original NumPy array.
+Note the GPU data isn't copied, just pointed to:: + + >>> from numba.cuda.cudadrv.devicearray import DeviceNDArray + >>> device_arr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=cuda_buf.to_numba()) + +(Ideally we could have defined an Arrow array in CPU memory, copied it to CUDA +memory without losing type information, and then invoked the Numba kernel on it +without constructing the DeviceNDArray by hand; this is not yet possible.) + +Finally we can run the Numba CUDA kernel on the Numba device array (here +with 16 blocks of 16 threads each):: + + >>> increment_by_one[16, 16](device_arr) + +And the results can be checked by copying back the CUDA buffer to CPU memory:: + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + array([1, 2, 3, 4], dtype=int32) + +Numba to Arrow +~~~~~~~~~~~~~~ + +Conversely, a Numba-created device array can be viewed as an Arrow CUDA buffer, +using the :meth:`CudaBuffer.from_numba` factory method. + +For the sake of example, let's first create a Numba device array:: + + >>> arr = np.arange(10, 14, dtype=np.int32) + >>> arr + array([10, 11, 12, 13], dtype=int32) + >>> device_arr = numba.cuda.to_device(arr) + +Then we can create a CUDA buffer pointing to the device array's memory. +We don't need to pass a CUDA context explicitly this time: the appropriate +CUDA context is automatically retrieved and adapted from the Numba object. + +:: + + >>> cuda_buf = cuda.CudaBuffer.from_numba(device_arr.gpu_data) + >>> cuda_buf.size + 16 + >>> cuda_buf.address + 30088364032 + >>> cuda_buf.context.device_number + 0 + +Of course, we can copy the CUDA buffer back to host memory:: + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + array([10, 11, 12, 13], dtype=int32) + +.. seealso:: + Documentation for Numba's `CUDA support <https://numba.readthedocs.io/en/stable/cuda/index.html>`_. diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index e86a0be0d04a4..d85537110e48c 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -86,14 +86,9 @@ On Linux and OSX: --file arrow/ci/conda_env_python.yml \ python=3.6 -On Windows: + conda activate pyarrow-dev -.. code-block:: shell - - conda create -y -n pyarrow-dev -c conda-forge ^ - --file arrow\ci\conda_env_cpp.yml ^ - --file arrow\ci\conda_env_python.yml ^ - python=3.6 +For Windows, see the `Developing on Windows`_ section below. We need to set some environment variables to let Arrow's build system know about our build toolchain: @@ -101,14 +96,21 @@ about our build toolchain: .. code-block:: shell export ARROW_BUILD_TYPE=release - export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX + export BOOST_HOME=$CONDA_PREFIX Using pip ~~~~~~~~~ +.. warning:: + + If you installed Python using the Anaconda distribution or `Miniconda + <https://conda.io/miniconda.html>`_, you cannot currently use ``virtualenv`` + to manage your development. Please follow the conda-based development + instructions instead. + On macOS, install all dependencies through Homebrew that are required for building Arrow C++: @@ -125,9 +127,13 @@ dependencies will be automatically built by Arrow's third-party toolchain. libboost-filesystem-dev \ libboost-system-dev \ libboost-regex-dev \ + python-dev \ + autoconf \ flex \ bison +If you are building Arrow for Python 3, install ``python3-dev`` instead of ``python-dev``. + On Arch Linux, you can get these dependencies via pacman. ..
code-block:: shell @@ -185,6 +191,12 @@ Now build and install the Arrow C++ libraries: If you don't want to build and install the Plasma in-memory object store, you can omit the ``-DARROW_PLASMA=on`` flag. +Also, if multiple versions of Python are installed in your environment, +you may have to pass additional parameters to cmake so that +it can find the right executable, headers and libraries. +For example, specifying ``-DPYTHON_EXECUTABLE=$VIRTUAL_ENV/bin/python`` +(assuming that you're in a virtualenv) lets cmake choose +the Python executable you are using; see the sketch below.
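+ +A minimal sketch of such an invocation (the other flags here are only +illustrative assumptions; keep whatever cmake options your build already +passes): + +.. code-block:: shell + + cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DARROW_PYTHON=on \ + -DPYTHON_EXECUTABLE=$VIRTUAL_ENV/bin/python \ + ..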
.. note:: @@ -197,9 +209,10 @@ Now, build pyarrow: .. code-block:: shell - cd arrow/python + pushd arrow/python python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --inplace + popd If you did not build with plasma, you can omit ``--with-plasma``. @@ -227,6 +240,7 @@ libraries), one can set ``--bundle-arrow-cpp``: .. code-block:: shell + pip install wheel # if not installed python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --bundle-arrow-cpp bdist_wheel @@ -287,11 +301,11 @@ First, starting from fresh clones of Apache Arrow: .. code-block:: shell - conda create -y -q -n pyarrow-dev ^ - python=3.6 numpy six setuptools cython pandas pytest ^ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib ^ - gflags brotli lz4-c zstd -c conda-forge - activate pyarrow-dev + conda create -y -n pyarrow-dev -c conda-forge ^ + --file arrow\ci\conda_env_cpp.yml ^ + --file arrow\ci\conda_env_python.yml ^ + python=3.7 + conda activate pyarrow-dev Now, we build and install Arrow C++ libraries @@ -341,3 +355,8 @@ Getting ``python-test.exe`` to run is a bit tricky because your set PYTHONHOME=%CONDA_PREFIX% Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work. + +Building the Documentation +========================== + +See :ref:`building-docs` for instructions to build the HTML documentation. diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 56282192b170b..9f96771494c79 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -18,21 +18,22 @@ Python bindings =============== -The Arrow Python bindings have first-class integration with NumPy, pandas, and -built-in Python objects. They are based on the C++ implementation of Arrow. - This is the documentation of the Python API of Apache Arrow. For more details -on the format and other language bindings see the parent documentation. -Here will we only detail the usage of the Python API for Arrow and the leaf +on the Arrow format and other language bindings see the +:doc:`parent documentation <../index>`. + +The Arrow Python bindings (also named "PyArrow") have first-class integration +with NumPy, pandas, and built-in Python objects. They are based on the C++ +implementation of Arrow. + +Here we will detail the usage of the Python API for Arrow and the leaf libraries that add additional functionality such as reading Apache Parquet files into Arrow structures. .. toctree:: :maxdepth: 2 - :caption: Getting Started install - development memory data ipc @@ -42,7 +43,9 @@ files into Arrow structures. pandas csv parquet + cuda extending api + development getting_involved - + benchmarks diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index d07d9004d2632..8092b6ce6a0ef 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -15,8 +15,8 @@ .. specific language governing permissions and limitations .. under the License. -Install PyArrow -=============== +Installing PyArrow +================== Conda ----- diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index 3f7e787cd0c2f..812d843b0df56 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -84,11 +84,11 @@ particular stream. Now we can do: Now ``buf`` contains the complete stream as an in-memory byte buffer. We can read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the -convenience function ``pyarrow.open_stream``: +convenience function ``pyarrow.ipc.open_stream``: .. ipython:: python - reader = pa.open_stream(buf) + reader = pa.ipc.open_stream(buf) reader.schema batches = [b for b in reader] @@ -125,11 +125,11 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as The difference between :class:`~pyarrow.RecordBatchFileReader` and :class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a ``seek`` method for random access. The stream reader only requires read -operations. We can also use the ``pyarrow.open_file`` method to open a file: +operations. We can also use the ``pyarrow.ipc.open_file`` method to open a file: .. ipython:: python - reader = pa.open_file(buf) + reader = pa.ipc.open_file(buf) Because we have access to the entire payload, we know the number of record batches in the file, and can read any at random: @@ -149,7 +149,7 @@ DataFrame output: .. ipython:: python - df = pa.open_file(buf).read_pandas() + df = pa.ipc.open_file(buf).read_pandas() df[:5] Arbitrary Object Serialization diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 1ee81e754da1c..ba66807b38a8e 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -35,8 +35,8 @@ Referencing and Allocating Memory pyarrow.Buffer -------------- -The :class:`~pyarrow.Buffer` object wraps the C++ ``arrow::Buffer`` type and is -the primary tool for memory management in Apache Arrow in C++. It permits +The :class:`Buffer` object wraps the C++ :cpp:class:`arrow::Buffer` type +which is the primary tool for memory management in Apache Arrow in C++. It permits higher-level array classes to safely interact with memory which they may or may not own. ``arrow::Buffer`` can be zero-copy sliced to permit Buffers to cheaply reference other Buffers, while preserving memory lifetime and clean @@ -46,8 +46,9 @@ There are many implementations of ``arrow::Buffer``, but they all provide a standard interface: a data pointer and length. This is similar to Python's built-in `buffer protocol` and ``memoryview`` objects. -A :class:`~pyarrow.Buffer` can be created from any Python object which -implements the buffer protocol. Let's consider a bytes object: +A :class:`Buffer` can be created from any Python object implementing +the buffer protocol by calling the :func:`py_buffer` function. Let's consider +a bytes object: .. ipython:: python @@ -61,18 +62,22 @@ implements the buffer protocol. Let's consider a bytes object: Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the ``data`` bytes object. -The Buffer's ``to_pybytes`` method can convert to a Python byte string: +External memory, in the form of a raw pointer and size, can also be +referenced using the :func:`foreign_buffer` function.
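+ +For illustration, here is a minimal sketch (it assumes a NumPy array as the +owner of the external memory; the ``base`` argument keeps that owner alive +for the lifetime of the Buffer): + +.. code-block:: python + + import numpy as np + import pyarrow as pa + + data = np.arange(4, dtype=np.int32) + # Wrap the array's raw pointer and byte size without copying. + buf = pa.foreign_buffer(data.ctypes.data, data.nbytes, base=data)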
+ +Buffers can be used in circumstances where a Python buffer or memoryview is +required, and such conversions are zero-copy: .. ipython:: python - buf.to_pybytes() + memoryview(buf) -Buffers can be used in circumstances where a Python buffer or memoryview is -required, and such conversions are also zero-copy: +The Buffer's :meth:`~Buffer.to_pybytes` method converts the Buffer's data to a +Python bytestring (thus making a copy of the data): .. ipython:: python - memoryview(buf) + buf.to_pybytes() Memory Pools ------------ @@ -104,6 +109,9 @@ the buffer is garbage-collected, all of the memory is freed: buf = None pa.total_allocated_bytes() +.. seealso:: + On-GPU buffers using Arrow's optional :doc:`CUDA integration <cuda>`. + Input and Output ================ diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index 16b4ff6926809..dbc5e77e83bff 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -29,6 +29,13 @@ to them. (such as a different type system, and support for null values) that this is a separate topic from :ref:`numpy_interop`. +To follow the examples in this document, make sure to run: + +.. ipython:: python + + import pandas as pd + import pyarrow as pa + DataFrames ---------- @@ -120,5 +127,64 @@ Arrow -> pandas Conversion +-------------------------------------+--------------------------------------------------------+ | ``TIMESTAMP(unit=*)`` | ``pd.Timestamp`` (``np.datetime64[ns]``) | +-------------------------------------+--------------------------------------------------------+ -| ``DATE`` | ``pd.Timestamp`` (``np.datetime64[ns]``) | +| ``DATE`` | ``object`` (with ``datetime.date`` objects) | +-------------------------------------+--------------------------------------------------------+ + +Categorical types +~~~~~~~~~~~~~~~~~ + +TODO + +Datetime (Timestamp) types +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TODO + +Date types +~~~~~~~~~~ + +While dates can be handled using the ``datetime64[ns]`` type in +pandas, some systems work with object arrays of Python's built-in +``datetime.date`` object: + +.. ipython:: python + + from datetime import date + s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)]) + s + +When converting to an Arrow array, the ``date32`` type will be used by +default: + +.. ipython:: python + + arr = pa.array(s) + arr.type + arr[0] + +To use the 64-bit ``date64``, specify this explicitly: + +.. ipython:: python + + arr = pa.array(s, type='date64') + arr.type + +When converting back with ``to_pandas``, object arrays of +``datetime.date`` objects are returned: + +.. ipython:: python + + arr.to_pandas() + +If you want to use NumPy's ``datetime64`` dtype instead, pass +``date_as_object=False``: + +.. ipython:: python + + s2 = pd.Series(arr.to_pandas(date_as_object=False)) + s2.dtype + +Time types +~~~~~~~~~~ + +TODO diff --git a/docs/source/python/plasma.rst b/docs/source/python/plasma.rst index 09837cf6e9ef9..660c5fbba7918 100644 --- a/docs/source/python/plasma.rst +++ b/docs/source/python/plasma.rst @@ -60,7 +60,7 @@ socket name: .. code-block:: python import pyarrow.plasma as plasma - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma") If the following error occurs from running the above Python code, that means that either the socket given is incorrect, or the ``./plasma_store`` is not currently running. Check to see if the Plasma store is still running. ..
code-block:: shell - >>> client = plasma.connect("/tmp/plasma", "", 0) + >>> client = plasma.connect("/tmp/plasma") Connection to socket failed for pathname /tmp/plasma Could not connect to socket /tmp/plasma @@ -179,7 +179,7 @@ the object buffer. # Create a different client. Note that this second client could be # created in the same or in a separate, concurrent Python session. - client2 = plasma.connect("/tmp/plasma", "", 0) + client2 = plasma.connect("/tmp/plasma") # Get the object in the second client. This blocks until the object has been sealed. object_id2 = plasma.ObjectID(20 * b"a") @@ -221,7 +221,7 @@ of the object info might change in the future): import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma") client.put("hello, world") # Sleep a little so we get different creation times @@ -452,7 +452,7 @@ You can test this with the following script: import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma") data = np.random.randn(100000000) tensor = pa.Tensor.from_numpy(data) diff --git a/format/Message.fbs b/format/Message.fbs index 830718139d88c..e14fdca8f155c 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -87,7 +87,7 @@ table DictionaryBatch { /// which may include experimental metadata types. For maximum compatibility, /// it is best to send data using RecordBatch union MessageHeader { - Schema, DictionaryBatch, RecordBatch, Tensor + Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor } table Message { @@ -96,4 +96,4 @@ table Message { bodyLength: long; } -root_type Message; \ No newline at end of file +root_type Message; diff --git a/format/README.md b/format/README.rst similarity index 98% rename from format/README.md rename to format/README.rst index 6da844549e640..0eaad49b7e394 100644 --- a/format/README.md +++ b/format/README.rst @@ -22,4 +22,4 @@ This folder contains binary protocol definitions for the Arrow columnar format and other parts of the project, like the Flight RPC framework. For documentation about the Arrow format, see the `docs/source/format` -directory. \ No newline at end of file +directory. diff --git a/format/Tensor.fbs b/format/Tensor.fbs index 18b614c3bde62..e77b353a0f33f 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -23,6 +23,9 @@ include "Schema.fbs"; namespace org.apache.arrow.flatbuf; +/// ---------------------------------------------------------------------- +/// Data structures for dense tensors + /// Shape data for a single axis in a tensor table TensorDim { /// Length of dimension @@ -48,3 +51,96 @@ table Tensor { } root_type Tensor; + +/// ---------------------------------------------------------------------- +/// EXPERIMENTAL: Data structures for sparse tensors + +/// Coordinate format of sparse tensor index. +table SparseTensorIndexCOO { + /// COO's index list is represented as an NxM matrix, + /// where N is the number of non-zero values, + /// and M is the number of dimensions of a sparse tensor. + /// indicesBuffer stores the location and size of this index matrix. + /// The type of index value is long, so the stride for the index matrix is unnecessary.
+ /// + /// For example, let X be a 2x3x4x5 tensor with the following 6 non-zero values: + /// + /// X[0, 1, 2, 0] := 1 + /// X[1, 1, 2, 3] := 2 + /// X[0, 2, 1, 0] := 3 + /// X[0, 1, 3, 0] := 4 + /// X[0, 1, 2, 1] := 5 + /// X[1, 2, 0, 4] := 6 + /// + /// In COO format, the index matrix of X is the following 6x4 matrix + /// (one row per non-zero value, one column per dimension): + /// + /// [[0, 1, 2, 0], + /// [0, 1, 2, 1], + /// [0, 1, 3, 0], + /// [0, 2, 1, 0], + /// [1, 1, 2, 3], + /// [1, 2, 0, 4]] + /// + /// Note that the indices are sorted in lexicographical order. + indicesBuffer: Buffer; +} + +/// Compressed Sparse Row format, which is matrix-specific. +table SparseMatrixIndexCSR { + /// indptrBuffer stores the location and size of the indptr array that + /// represents the range of the rows. + /// The i-th row spans from indptr[i] to indptr[i+1] in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// + /// The array of non-zero values in X is: + /// + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. + /// + /// And the indptr of X is (its last entry equals the number of non-zero values, 9): + /// + /// indptr(X) = [0, 2, 3, 5, 5, 8, 9]. + indptrBuffer: Buffer; + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + /// + /// For example, the indices of the above X are: + /// + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. + indicesBuffer: Buffer; +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSR +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type; + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim]; + + /// The number of non-zero values in a sparse tensor. + non_zero_length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex; + + /// The location and size of the tensor's data + data: Buffer; +} + +root_type SparseTensor; diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index b188dcd68c729..ef37aef42f602 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -180,8 +180,8 @@ func init() { arrow.STRING: func(data *Data) Interface { return NewStringData(data) }, arrow.BINARY: func(data *Data) Interface { return NewBinaryData(data) }, arrow.FIXED_SIZE_BINARY: func(data *Data) Interface { return NewFixedSizeBinaryData(data) }, - arrow.DATE32: unsupportedArrayType, - arrow.DATE64: unsupportedArrayType, + arrow.DATE32: func(data *Data) Interface { return NewDate32Data(data) }, + arrow.DATE64: func(data *Data) Interface { return NewDate64Data(data) }, arrow.TIMESTAMP: func(data *Data) Interface { return NewTimestampData(data) }, arrow.TIME32: func(data *Data) Interface { return NewTime32Data(data) }, arrow.TIME64: func(data *Data) Interface { return NewTime64Data(data) }, diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 0b89b7e5817cc..850fb09b4a81a 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -38,7 +38,13 @@ func NewBinaryData(data *Data) *Binary { } // Value returns the slice at index i. This value should not be mutated.
-func (a *Binary) Value(i int) []byte { return a.valueBytes[a.valueOffsets[i]:a.valueOffsets[i+1]] } +func (a *Binary) Value(i int) []byte { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + idx := a.array.data.offset + i + return a.valueBytes[a.valueOffsets[idx]:a.valueOffsets[idx+1]] +} // ValueString returns the string at index i without performing additional allocations. // The string is only valid for the lifetime of the Binary array. @@ -47,10 +53,32 @@ func (a *Binary) ValueString(i int) string { return *(*string)(unsafe.Pointer(&b)) } -func (a *Binary) ValueOffset(i int) int { return int(a.valueOffsets[i]) } -func (a *Binary) ValueLen(i int) int { return int(a.valueOffsets[i+1] - a.valueOffsets[i]) } -func (a *Binary) ValueOffsets() []int32 { return a.valueOffsets } -func (a *Binary) ValueBytes() []byte { return a.valueBytes } +func (a *Binary) ValueOffset(i int) int { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return int(a.valueOffsets[a.array.data.offset+i]) +} + +func (a *Binary) ValueLen(i int) int { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + beg := a.array.data.offset + i + return int(a.valueOffsets[beg+1] - a.valueOffsets[beg]) +} + +func (a *Binary) ValueOffsets() []int32 { + beg := a.array.data.offset + end := beg + a.array.data.length + 1 + return a.valueOffsets[beg:end] +} + +func (a *Binary) ValueBytes() []byte { + beg := a.array.data.offset + end := beg + a.array.data.length + return a.valueBytes[a.valueOffsets[beg]:a.valueOffsets[end]] +} func (a *Binary) setData(data *Data) { if len(data.buffers) != 3 { diff --git a/go/arrow/array/binary_test.go b/go/arrow/array/binary_test.go index 87d1b58c47d14..2af45dee60f76 100644 --- a/go/arrow/array/binary_test.go +++ b/go/arrow/array/binary_test.go @@ -17,6 +17,7 @@ package array import ( + "reflect" "testing" "github.com/stretchr/testify/assert" @@ -62,3 +63,345 @@ func TestBinary(t *testing.T) { b.Release() } + +func TestBinarySliceData(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "def", "g", "hijk", "lm", "n", "opq", "rs", "tu"} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + for _, v := range values { + b.AppendString(v) + } + + arr := b.NewArray().(*Binary) + defer arr.Release() + + if got, want := arr.Len(), len(values); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]string, arr.Len()) + + for i := range vs { + vs[i] = arr.ValueString(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + want []string + }{ + { + interval: [2]int64{0, 0}, + want: []string{}, + }, + { + interval: [2]int64{0, 5}, + want: []string{"a", "bc", "def", "g", "hijk"}, + }, + { + interval: [2]int64{0, 10}, + want: []string{"a", "bc", "def", "g", "hijk", "lm", "n", "opq", "rs", "tu"}, + }, + { + interval: [2]int64{5, 10}, + want: []string{"lm", "n", "opq", "rs", "tu"}, + }, + { + interval: [2]int64{10, 10}, + want: []string{}, + }, + { + interval: [2]int64{2, 7}, + want: []string{"def", "g", "hijk", "lm", "n"}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := NewSlice(arr, tc.interval[0], tc.interval[1]).(*Binary) + defer slice.Release() + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, 
want=%d", got, want) + } + + vs := make([]string, slice.Len()) + + for i := range vs { + vs[i] = slice.ValueString(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBinarySliceDataWithNull(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + if got, want := arr.Len(), len(values); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 3; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]string, arr.Len()) + + for i := range vs { + vs[i] = arr.ValueString(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + nulls int + want []string + }{ + { + interval: [2]int64{0, 2}, + nulls: 0, + want: []string{"a", "bc"}, + }, + { + interval: [2]int64{0, 3}, + nulls: 1, + want: []string{"a", "bc", ""}, + }, + { + interval: [2]int64{0, 4}, + nulls: 2, + want: []string{"a", "bc", "", ""}, + }, + { + interval: [2]int64{4, 8}, + nulls: 0, + want: []string{"hijk", "lm", "", "opq"}, + }, + { + interval: [2]int64{2, 9}, + nulls: 3, + want: []string{"", "", "hijk", "lm", "", "opq", ""}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := NewSlice(arr, tc.interval[0], tc.interval[1]).(*Binary) + defer slice.Release() + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.NullN(), tc.nulls; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + vs := make([]string, slice.Len()) + + for i := range vs { + vs[i] = slice.ValueString(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBinarySliceOutOfBounds(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "def", "g", "hijk", "lm", "n", "opq", "rs", "tu"} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + for _, v := range values { + b.AppendString(v) + } + + arr := b.NewArray().(*Binary) + defer arr.Release() + + slice := NewSlice(arr, 3, 8).(*Binary) + defer slice.Release() + + tests := []struct { + index int + panic bool + }{ + { + index: -1, + panic: true, + }, + { + index: 5, + panic: true, + }, + { + index: 0, + panic: false, + }, + { + index: 4, + panic: false, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + var val string + + if tc.panic { + defer func() { + e := recover() + if e == nil { + t.Fatalf("this should have panicked, but did not; slice value %q", val) + } + if got, want := e.(string), "arrow/array: index out of range"; got != want { + t.Fatalf("invalid error. 
got=%q, want=%q", got, want) + } + }() + } else { + defer func() { + if e := recover(); e != nil { + t.Fatalf("unexpected panic: %v", e) + } + }() + } + + val = slice.ValueString(tc.index) + }) + } +} + +func TestBinaryValueOffset(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + offset := 3 + vs := values[2:9] + + for i, v := range vs { + assert.Equal(t, offset, slice.ValueOffset(i)) + offset += len(v) + } +} + +func TestBinaryValueLen(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + vs := values[2:9] + + for i, v := range vs { + assert.Equal(t, len(v), slice.ValueLen(i)) + } +} + +func TestBinaryValueOffsets(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + assert.Equal(t, []int32{0, 1, 3, 3, 3, 7, 9, 9, 12, 12, 14}, arr.ValueOffsets()) + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + assert.Equal(t, []int32{3, 3, 3, 7, 9, 9, 12, 12}, slice.ValueOffsets()) +} + +func TestBinaryValueBytes(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + assert.Equal(t, []byte{'a', 'b', 'c', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q', 't', 'u'}, arr.ValueBytes()) + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + assert.Equal(t, []byte{'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'}, slice.ValueBytes()) +} diff --git a/go/arrow/array/boolean.go b/go/arrow/array/boolean.go index 19a692345e357..68de951e0ce8c 100644 --- a/go/arrow/array/boolean.go +++ b/go/arrow/array/boolean.go @@ -45,7 +45,12 @@ func NewBooleanData(data *Data) *Boolean { return a } -func (a *Boolean) Value(i int) bool { return bitutil.BitIsSet(a.values, i) } +func (a *Boolean) Value(i int) bool { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return bitutil.BitIsSet(a.values, a.array.data.offset+i) +} func (a *Boolean) String() string { o := new(strings.Builder) diff --git 
a/go/arrow/array/boolean_test.go b/go/arrow/array/boolean_test.go new file mode 100644 index 0000000000000..e6f4b9bf2bc51 --- /dev/null +++ b/go/arrow/array/boolean_test.go @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package array_test + +import ( + "reflect" + "testing" + + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" +) + +func TestBooleanSliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + values := []bool{true, false, true, true, true, true, true, false, true, false} + + b := array.NewBooleanBuilder(pool) + defer b.Release() + + for _, v := range values { + b.Append(v) + } + + arr := b.NewArray().(*array.Boolean) + defer arr.Release() + + if got, want := arr.Len(), len(values); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, arr.Len()) + + for i := range vs { + vs[i] = arr.Value(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + want []bool + }{ + { + interval: [2]int64{0, 0}, + want: []bool{}, + }, + { + interval: [2]int64{10, 10}, + want: []bool{}, + }, + { + interval: [2]int64{0, 5}, + want: []bool{true, false, true, true, true}, + }, + { + interval: [2]int64{5, 10}, + want: []bool{true, true, false, true, false}, + }, + { + interval: [2]int64{2, 7}, + want: []bool{true, true, true, true, true}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := array.NewSlice(arr, tc.interval[0], tc.interval[1]).(*array.Boolean) + defer slice.Release() + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, slice.Len()) + + for i := range vs { + vs[i] = slice.Value(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBooleanSliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + values := []bool{true, false, true, false, false, false, true, false, true, false} + valids := []bool{true, false, true, true, true, true, true, false, true, true} + + b := array.NewBooleanBuilder(pool) + defer b.Release() + + b.AppendValues(values, valids) + + arr := b.NewArray().(*array.Boolean) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 2; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, arr.Len()) + + for i := range vs { + vs[i] = arr.Value(i) + } + + if got, want := vs, values; 
!reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + nulls int + want []bool + }{ + { + interval: [2]int64{2, 9}, + nulls: 1, + want: []bool{true, false, false, false, true, false, true}, + }, + { + interval: [2]int64{0, 7}, + nulls: 1, + want: []bool{true, false, true, false, false, false, true}, + }, + { + interval: [2]int64{1, 8}, + nulls: 2, + want: []bool{false, true, false, false, false, true, false}, + }, + { + interval: [2]int64{2, 7}, + nulls: 0, + want: []bool{true, false, false, false, true}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := array.NewSlice(arr, tc.interval[0], tc.interval[1]).(*array.Boolean) + defer slice.Release() + + if got, want := slice.NullN(), tc.nulls; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, slice.Len()) + + for i := range vs { + vs[i] = slice.Value(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBooleanSliceOutOfBounds(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + values := []bool{true, false, true, false, true, false, true, false, true, false} + + b := array.NewBooleanBuilder(pool) + defer b.Release() + + for _, v := range values { + b.Append(v) + } + + arr := b.NewArray().(*array.Boolean) + defer arr.Release() + + slice := array.NewSlice(arr, 3, 8).(*array.Boolean) + defer slice.Release() + + tests := []struct { + index int + panic bool + }{ + { + index: -1, + panic: true, + }, + { + index: 5, + panic: true, + }, + { + index: 0, + panic: false, + }, + { + index: 4, + panic: false, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + var val bool + + if tc.panic { + defer func() { + e := recover() + if e == nil { + t.Fatalf("this should have panicked, but did not; slice value %v", val) + } + if got, want := e.(string), "arrow/array: index out of range"; got != want { + t.Fatalf("invalid error. got=%q, want=%q", got, want) + } + }() + } else { + defer func() { + if e := recover(); e != nil { + t.Fatalf("unexpected panic: %v", e) + } + }() + } + + val = slice.Value(tc.index) + }) + } +} diff --git a/go/arrow/array/numeric.gen.go b/go/arrow/array/numeric.gen.go index 1f734c05127b4..1fb8257d940c4 100644 --- a/go/arrow/array/numeric.gen.go +++ b/go/arrow/array/numeric.gen.go @@ -609,3 +609,93 @@ func (a *Time64) setData(data *Data) { a.values = a.values[beg:end] } } + +// A type which represents an immutable sequence of arrow.Date32 values. 
+type Date32 struct { + array + values []arrow.Date32 +} + +func NewDate32Data(data *Data) *Date32 { + a := &Date32{} + a.refCount = 1 + a.setData(data) + return a +} + +func (a *Date32) Value(i int) arrow.Date32 { return a.values[i] } +func (a *Date32) Date32Values() []arrow.Date32 { return a.values } + +func (a *Date32) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i, v := range a.values { + if i > 0 { + fmt.Fprintf(o, " ") + } + switch { + case a.IsNull(i): + o.WriteString("(null)") + default: + fmt.Fprintf(o, "%v", v) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Date32) setData(data *Data) { + a.array.setData(data) + vals := data.buffers[1] + if vals != nil { + a.values = arrow.Date32Traits.CastFromBytes(vals.Bytes()) + beg := a.array.data.offset + end := beg + a.array.data.length + a.values = a.values[beg:end] + } +} + +// A type which represents an immutable sequence of arrow.Date64 values. +type Date64 struct { + array + values []arrow.Date64 +} + +func NewDate64Data(data *Data) *Date64 { + a := &Date64{} + a.refCount = 1 + a.setData(data) + return a +} + +func (a *Date64) Value(i int) arrow.Date64 { return a.values[i] } +func (a *Date64) Date64Values() []arrow.Date64 { return a.values } + +func (a *Date64) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i, v := range a.values { + if i > 0 { + fmt.Fprintf(o, " ") + } + switch { + case a.IsNull(i): + o.WriteString("(null)") + default: + fmt.Fprintf(o, "%v", v) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Date64) setData(data *Data) { + a.array.setData(data) + vals := data.buffers[1] + if vals != nil { + a.values = arrow.Date64Traits.CastFromBytes(vals.Bytes()) + beg := a.array.data.offset + end := beg + a.array.data.length + a.values = a.values[beg:end] + } +} diff --git a/go/arrow/array/numeric_test.go b/go/arrow/array/numeric_test.go index 9e8267a70de6c..fc7f04addbe0d 100644 --- a/go/arrow/array/numeric_test.go +++ b/go/arrow/array/numeric_test.go @@ -394,3 +394,223 @@ func TestTime64SliceDataWithNull(t *testing.T) { t.Fatalf("got=%v, want=%v", got, want) } } + +func TestNewDate32Data(t *testing.T) { + exp := []arrow.Date32{1, 2, 4, 8, 16} + + dtype := &arrow.Date32Type{} + ad := array.NewData( + dtype, len(exp), + []*memory.Buffer{nil, memory.NewBufferBytes(arrow.Date32Traits.CastToBytes(exp))}, + nil, 0, 0, + ) + fa := array.NewDate32Data(ad) + + assert.Equal(t, len(exp), fa.Len(), "unexpected Len()") + assert.Equal(t, exp, fa.Date32Values(), "unexpected Date32Values()") +} + +func TestDate32SliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 4 + ) + + var ( + vs = []arrow.Date32{1, 2, 3, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate32Builder(pool) + defer b.Release() + + for _, v := range vs { + b.Append(v) + } + + arr := b.NewArray().(*array.Date32) + defer arr.Release() + + if got, want := arr.Len(), len(vs); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date32) + defer slice.Release() + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date32Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestDate32SliceDataWithNull(t 
*testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 5 + ) + + var ( + valids = []bool{true, true, true, false, true, true} + vs = []arrow.Date32{1, 2, 3, 0, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate32Builder(pool) + defer b.Release() + + b.AppendValues(vs, valids) + + arr := b.NewArray().(*array.Date32) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date32) + defer slice.Release() + + if got, want := slice.NullN(), 1; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date32Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestNewDate64Data(t *testing.T) { + exp := []arrow.Date64{1, 2, 4, 8, 16} + + dtype := &arrow.Date64Type{} + ad := array.NewData( + dtype, len(exp), + []*memory.Buffer{nil, memory.NewBufferBytes(arrow.Date64Traits.CastToBytes(exp))}, + nil, 0, 0, + ) + fa := array.NewDate64Data(ad) + + assert.Equal(t, len(exp), fa.Len(), "unexpected Len()") + assert.Equal(t, exp, fa.Date64Values(), "unexpected Date64Values()") +} + +func TestDate64SliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 4 + ) + + var ( + vs = []arrow.Date64{1, 2, 3, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate64Builder(pool) + defer b.Release() + + for _, v := range vs { + b.Append(v) + } + + arr := b.NewArray().(*array.Date64) + defer arr.Release() + + if got, want := arr.Len(), len(vs); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date64Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date64) + defer slice.Release() + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date64Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestDate64SliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 5 + ) + + var ( + valids = []bool{true, true, true, false, true, true} + vs = []arrow.Date64{1, 2, 3, 0, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate64Builder(pool) + defer b.Release() + + b.AppendValues(vs, valids) + + arr := b.NewArray().(*array.Date64) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date64Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date64) + defer slice.Release() + + if got, want := slice.NullN(), 1; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, 
want=%d", got, want) + } + + if got, want := slice.Date64Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} diff --git a/go/arrow/array/numericbuilder.gen.go b/go/arrow/array/numericbuilder.gen.go index 3a7dc167f15aa..946c5ba74aaeb 100644 --- a/go/arrow/array/numericbuilder.gen.go +++ b/go/arrow/array/numericbuilder.gen.go @@ -1772,6 +1772,274 @@ func (b *Time64Builder) newData() (data *Data) { return } +type Date32Builder struct { + builder + + data *memory.Buffer + rawData []arrow.Date32 +} + +func NewDate32Builder(mem memory.Allocator) *Date32Builder { + return &Date32Builder{builder: builder{refCount: 1, mem: mem}} +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +func (b *Date32Builder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *Date32Builder) Append(v arrow.Date32) { + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *Date32Builder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *Date32Builder) UnsafeAppend(v arrow.Date32) { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + b.rawData[b.length] = v + b.length++ +} + +func (b *Date32Builder) UnsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +// AppendValues will append the values in the v slice. The valid slice determines which values +// in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, +// all values in v are appended and considered valid. +func (b *Date32Builder) AppendValues(v []arrow.Date32, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + b.Reserve(len(v)) + if len(v) > 0 { + arrow.Date32Traits.Copy(b.rawData[b.length:], v) + } + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *Date32Builder) init(capacity int) { + b.builder.init(capacity) + + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.Date32Traits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.Date32Traits.CastFromBytes(b.data.Bytes()) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and calling Resize if necessary. +func (b *Date32Builder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may reduced. +func (b *Date32Builder) Resize(n int) { + nBuilder := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nBuilder, b.init) + b.data.Resize(arrow.Date32Traits.BytesRequired(n)) + b.rawData = arrow.Date32Traits.CastFromBytes(b.data.Bytes()) + } +} + +// NewArray creates a Date32 array from the memory buffers used by the builder and resets the Date32Builder +// so it can be used to build a new array. 
+func (b *Date32Builder) NewArray() Interface { + return b.NewDate32Array() +} + +// NewDate32Array creates a Date32 array from the memory buffers used by the builder and resets the Date32Builder +// so it can be used to build a new array. +func (b *Date32Builder) NewDate32Array() (a *Date32) { + data := b.newData() + a = NewDate32Data(data) + data.Release() + return +} + +func (b *Date32Builder) newData() (data *Data) { + bytesRequired := arrow.Date32Traits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + data = NewData(arrow.PrimitiveTypes.Date32, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + + return +} + +type Date64Builder struct { + builder + + data *memory.Buffer + rawData []arrow.Date64 +} + +func NewDate64Builder(mem memory.Allocator) *Date64Builder { + return &Date64Builder{builder: builder{refCount: 1, mem: mem}} +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +func (b *Date64Builder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *Date64Builder) Append(v arrow.Date64) { + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *Date64Builder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *Date64Builder) UnsafeAppend(v arrow.Date64) { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + b.rawData[b.length] = v + b.length++ +} + +func (b *Date64Builder) UnsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +// AppendValues will append the values in the v slice. The valid slice determines which values +// in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, +// all values in v are appended and considered valid. +func (b *Date64Builder) AppendValues(v []arrow.Date64, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + b.Reserve(len(v)) + if len(v) > 0 { + arrow.Date64Traits.Copy(b.rawData[b.length:], v) + } + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *Date64Builder) init(capacity int) { + b.builder.init(capacity) + + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.Date64Traits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.Date64Traits.CastFromBytes(b.data.Bytes()) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and calling Resize if necessary. +func (b *Date64Builder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may be reduced.
+func (b *Date64Builder) Resize(n int) { + nBuilder := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nBuilder, b.init) + b.data.Resize(arrow.Date64Traits.BytesRequired(n)) + b.rawData = arrow.Date64Traits.CastFromBytes(b.data.Bytes()) + } +} + +// NewArray creates a Date64 array from the memory buffers used by the builder and resets the Date64Builder +// so it can be used to build a new array. +func (b *Date64Builder) NewArray() Interface { + return b.NewDate64Array() +} + +// NewDate64Array creates a Date64 array from the memory buffers used by the builder and resets the Date64Builder +// so it can be used to build a new array. +func (b *Date64Builder) NewDate64Array() (a *Date64) { + data := b.newData() + a = NewDate64Data(data) + data.Release() + return +} + +func (b *Date64Builder) newData() (data *Data) { + bytesRequired := arrow.Date64Traits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + data = NewData(arrow.PrimitiveTypes.Date64, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + + return +} + var ( _ Builder = (*Int64Builder)(nil) _ Builder = (*Uint64Builder)(nil) @@ -1786,4 +2054,6 @@ var ( _ Builder = (*TimestampBuilder)(nil) _ Builder = (*Time32Builder)(nil) _ Builder = (*Time64Builder)(nil) + _ Builder = (*Date32Builder)(nil) + _ Builder = (*Date64Builder)(nil) ) diff --git a/go/arrow/array/numericbuilder_test.go b/go/arrow/array/numericbuilder_test.go index 65f3c86c2ea35..3bb49a3af7310 100644 --- a/go/arrow/array/numericbuilder_test.go +++ b/go/arrow/array/numericbuilder_test.go @@ -362,3 +362,223 @@ func TestTime64Builder_Resize(t *testing.T) { ab.Release() } + +func TestNewDate32Builder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + ab.Append(1) + ab.Append(2) + ab.Append(3) + ab.AppendNull() + ab.Append(5) + ab.Append(6) + ab.AppendNull() + ab.Append(8) + ab.Append(9) + ab.Append(10) + + // check state of builder before NewDate32Array + assert.Equal(t, 10, ab.Len(), "unexpected Len()") + assert.Equal(t, 2, ab.NullN(), "unexpected NullN()") + + a := ab.NewDate32Array() + + // check state of builder after NewDate32Array + assert.Zero(t, ab.Len(), "unexpected ArrayBuilder.Len(), NewDate32Array did not reset state") + assert.Zero(t, ab.Cap(), "unexpected ArrayBuilder.Cap(), NewDate32Array did not reset state") + assert.Zero(t, ab.NullN(), "unexpected ArrayBuilder.NullN(), NewDate32Array did not reset state") + + // check state of array + assert.Equal(t, 2, a.NullN(), "unexpected null count") + assert.Equal(t, []arrow.Date32{1, 2, 3, 0, 5, 6, 0, 8, 9, 10}, a.Date32Values(), "unexpected Date32Values") + assert.Equal(t, []byte{0xb7}, a.NullBitmapBytes()[:1]) // 4 bytes due to minBuilderCapacity + assert.Len(t, a.Date32Values(), 10, "unexpected length of Date32Values") + + a.Release() + + ab.Append(7) + ab.Append(8) + + a = ab.NewDate32Array() + + assert.Equal(t, 0, a.NullN()) + assert.Equal(t, []arrow.Date32{7, 8}, a.Date32Values()) + assert.Len(t, a.Date32Values(), 2) + + a.Release() +} + +func TestDate32Builder_AppendValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + exp := []arrow.Date32{1, 2, 3, 4} + 
ab.AppendValues(exp, nil) + a := ab.NewDate32Array() + assert.Equal(t, exp, a.Date32Values()) + + a.Release() + ab.Release() +} + +func TestDate32Builder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + exp := []arrow.Date32{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate32Array() + assert.Equal(t, exp, a.Date32Values()) + a.Release() + + a = ab.NewDate32Array() + assert.Zero(t, a.Len()) + a.Release() + + ab.Release() +} + +func TestDate32Builder_Resize(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + assert.Equal(t, 0, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + ab.Reserve(63) + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + for i := 0; i < 63; i++ { + ab.Append(0) + } + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 63, ab.Len()) + + ab.Resize(5) + assert.Equal(t, 5, ab.Len()) + + ab.Resize(32) + assert.Equal(t, 5, ab.Len()) + + ab.Release() +} + +func TestNewDate64Builder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + ab.Append(1) + ab.Append(2) + ab.Append(3) + ab.AppendNull() + ab.Append(5) + ab.Append(6) + ab.AppendNull() + ab.Append(8) + ab.Append(9) + ab.Append(10) + + // check state of builder before NewDate64Array + assert.Equal(t, 10, ab.Len(), "unexpected Len()") + assert.Equal(t, 2, ab.NullN(), "unexpected NullN()") + + a := ab.NewDate64Array() + + // check state of builder after NewDate64Array + assert.Zero(t, ab.Len(), "unexpected ArrayBuilder.Len(), NewDate64Array did not reset state") + assert.Zero(t, ab.Cap(), "unexpected ArrayBuilder.Cap(), NewDate64Array did not reset state") + assert.Zero(t, ab.NullN(), "unexpected ArrayBuilder.NullN(), NewDate64Array did not reset state") + + // check state of array + assert.Equal(t, 2, a.NullN(), "unexpected null count") + assert.Equal(t, []arrow.Date64{1, 2, 3, 0, 5, 6, 0, 8, 9, 10}, a.Date64Values(), "unexpected Date64Values") + assert.Equal(t, []byte{0xb7}, a.NullBitmapBytes()[:1]) // 4 bytes due to minBuilderCapacity + assert.Len(t, a.Date64Values(), 10, "unexpected length of Date64Values") + + a.Release() + + ab.Append(7) + ab.Append(8) + + a = ab.NewDate64Array() + + assert.Equal(t, 0, a.NullN()) + assert.Equal(t, []arrow.Date64{7, 8}, a.Date64Values()) + assert.Len(t, a.Date64Values(), 2) + + a.Release() +} + +func TestDate64Builder_AppendValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + exp := []arrow.Date64{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate64Array() + assert.Equal(t, exp, a.Date64Values()) + + a.Release() + ab.Release() +} + +func TestDate64Builder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + exp := []arrow.Date64{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate64Array() + assert.Equal(t, exp, a.Date64Values()) + a.Release() + + a = ab.NewDate64Array() + assert.Zero(t, a.Len()) + a.Release() + + ab.Release() +} + +func TestDate64Builder_Resize(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + assert.Equal(t, 0, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + 
ab.Reserve(63) + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + for i := 0; i < 63; i++ { + ab.Append(0) + } + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 63, ab.Len()) + + ab.Resize(5) + assert.Equal(t, 5, ab.Len()) + + ab.Resize(32) + assert.Equal(t, 5, ab.Len()) + + ab.Release() +} diff --git a/go/arrow/csv/csv_test.go b/go/arrow/csv/csv_test.go index aaafb37554b87..97f31cc209c27 100644 --- a/go/arrow/csv/csv_test.go +++ b/go/arrow/csv/csv_test.go @@ -20,8 +20,6 @@ import ( "bytes" "fmt" "io/ioutil" - "log" - "os" "testing" "github.com/apache/arrow/go/arrow" @@ -30,17 +28,24 @@ import ( ) func Example() { - f, err := os.Open("testdata/simple.csv") - if err != nil { - log.Fatal(err) - } - defer f.Close() + f := bytes.NewBufferString(`## a simple set of data: int64;float64;string +0;0;str-0 +1;1;str-1 +2;2;str-2 +3;3;str-3 +4;4;str-4 +5;5;str-5 +6;6;str-6 +7;7;str-7 +8;8;str-8 +9;9;str-9 +`) schema := arrow.NewSchema( []arrow.Field{ - arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, - arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, - arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, }, nil, ) @@ -90,17 +95,24 @@ func Example() { } func Example_withChunk() { - f, err := os.Open("testdata/simple.csv") - if err != nil { - log.Fatal(err) - } - defer f.Close() + f := bytes.NewBufferString(`## a simple set of data: int64;float64;string +0;0;str-0 +1;1;str-1 +2;2;str-2 +3;3;str-3 +4;4;str-4 +5;5;str-5 +6;6;str-6 +7;7;str-7 +8;8;str-8 +9;9;str-9 +`) schema := arrow.NewSchema( []arrow.Field{ - arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, - arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, - arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, }, nil, ) diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index 60cc98a4b97d9..444495058a591 100644 --- a/go/arrow/datatype_fixedwidth.go +++ b/go/arrow/datatype_fixedwidth.go @@ -37,6 +37,8 @@ type ( Time32 int32 Time64 int64 TimeUnit int + Date32 int32 + Date64 int64 ) const ( diff --git a/go/arrow/datatype_numeric.gen.go b/go/arrow/datatype_numeric.gen.go index 2ec4c4098a4a6..9b5dc835b1ea2 100644 --- a/go/arrow/datatype_numeric.gen.go +++ b/go/arrow/datatype_numeric.gen.go @@ -78,6 +78,18 @@ func (t *Float64Type) ID() Type { return FLOAT64 } func (t *Float64Type) Name() string { return "float64" } func (t *Float64Type) BitWidth() int { return 64 } +type Date32Type struct{} + +func (t *Date32Type) ID() Type { return DATE32 } +func (t *Date32Type) Name() string { return "date32" } +func (t *Date32Type) BitWidth() int { return 32 } + +type Date64Type struct{} + +func (t *Date64Type) ID() Type { return DATE64 } +func (t *Date64Type) Name() string { return "date64" } +func (t *Date64Type) BitWidth() int { return 64 } + var ( PrimitiveTypes = struct { Int8 DataType @@ -90,6 +102,8 @@ var ( Uint64 DataType Float32 DataType Float64 DataType + Date32 DataType + Date64 DataType }{ Int8: &Int8Type{}, @@ -102,5 +116,7 @@ var ( Uint64: &Uint64Type{}, Float32: &Float32Type{}, Float64: &Float64Type{}, + Date32: &Date32Type{}, + Date64: &Date64Type{}, } ) diff --git a/go/arrow/datatype_numeric.gen.go.tmpldata b/go/arrow/datatype_numeric.gen.go.tmpldata 
index 415b51b2e16bd..9badc6ee2b211 100644 --- a/go/arrow/datatype_numeric.gen.go.tmpldata +++ b/go/arrow/datatype_numeric.gen.go.tmpldata @@ -48,5 +48,15 @@ "Name": "Float64", "Type": "float64", "Size": 64 + }, + { + "Name": "Date32", + "Type": "date32", + "Size": 32 + }, + { + "Name": "Date64", + "Type": "date64", + "Size": 64 } ] diff --git a/go/arrow/numeric.tmpldata b/go/arrow/numeric.tmpldata index b9e976eea0534..45452ab4468c6 100644 --- a/go/arrow/numeric.tmpldata +++ b/go/arrow/numeric.tmpldata @@ -107,5 +107,23 @@ "Opt": { "Parametric": true } + }, + { + "Name": "Date32", + "name": "date32", + "Type": "Date32", + "QualifiedType": "arrow.Date32", + "InternalType": "int32", + "Default": "0", + "Size": "4" + }, + { + "Name": "Date64", + "name": "date64", + "Type": "Date64", + "QualifiedType": "arrow.Date64", + "InternalType": "int64", + "Default": "0", + "Size": "8" } -] \ No newline at end of file +] diff --git a/go/arrow/type_traits_numeric.gen.go b/go/arrow/type_traits_numeric.gen.go index 59ed13f541a53..14fafbc57659b 100644 --- a/go/arrow/type_traits_numeric.gen.go +++ b/go/arrow/type_traits_numeric.gen.go @@ -38,6 +38,8 @@ var ( TimestampTraits timestampTraits Time32Traits time32Traits Time64Traits time64Traits + Date32Traits date32Traits + Date64Traits date64Traits ) // Int64 traits @@ -663,3 +665,99 @@ func (time64Traits) CastToBytes(b []Time64) []byte { // Copy copies src to dst. func (time64Traits) Copy(dst, src []Time64) { copy(dst, src) } + +// Date32 traits + +const ( + // Date32SizeBytes specifies the number of bytes required to store a single Date32 in memory + Date32SizeBytes = int(unsafe.Sizeof(Date32(0))) +) + +type date32Traits struct{} + +// BytesRequired returns the number of bytes required to store n elements in memory. +func (date32Traits) BytesRequired(n int) int { return Date32SizeBytes * n } + +// PutValue writes v to the first Date32SizeBytes bytes of b in little-endian byte order. +func (date32Traits) PutValue(b []byte, v Date32) { + binary.LittleEndian.PutUint32(b, uint32(v)) +} + +// CastFromBytes reinterprets the slice b to a slice of type Date32. +// +// NOTE: len(b) must be a multiple of Date32SizeBytes. +func (date32Traits) CastFromBytes(b []byte) []Date32 { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []Date32 + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / Date32SizeBytes + s.Cap = h.Cap / Date32SizeBytes + + return res +} + +// CastToBytes reinterprets the slice b to a slice of bytes. +func (date32Traits) CastToBytes(b []Date32) []byte { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []byte + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * Date32SizeBytes + s.Cap = h.Cap * Date32SizeBytes + + return res +} + +// Copy copies src to dst. +func (date32Traits) Copy(dst, src []Date32) { copy(dst, src) } + +// Date64 traits + +const ( + // Date64SizeBytes specifies the number of bytes required to store a single Date64 in memory + Date64SizeBytes = int(unsafe.Sizeof(Date64(0))) +) + +type date64Traits struct{} + +// BytesRequired returns the number of bytes required to store n elements in memory. +func (date64Traits) BytesRequired(n int) int { return Date64SizeBytes * n } + +// PutValue writes v to the first Date64SizeBytes bytes of b in little-endian byte order. +func (date64Traits) PutValue(b []byte, v Date64) { + binary.LittleEndian.PutUint64(b, uint64(v)) +} + +// CastFromBytes reinterprets the slice b to a slice of type Date64. +// +// NOTE: len(b) must be a multiple of Date64SizeBytes.
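+//
+// Editor's sketch of a zero-copy round trip through these traits (assumes the
+// arrow package import; the casts share the backing memory, nothing is copied):
+//
+//	raw := make([]byte, arrow.Date64Traits.BytesRequired(2))
+//	arrow.Date64Traits.PutValue(raw, arrow.Date64(42)) // little-endian encode into raw[0:8]
+//	vals := arrow.Date64Traits.CastFromBytes(raw)      // []arrow.Date64 of len 2
+//	back := arrow.Date64Traits.CastToBytes(vals)       // same bytes as raw
+//	_ = back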
+func (date64Traits) CastFromBytes(b []byte) []Date64 { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []Date64 + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / Date64SizeBytes + s.Cap = h.Cap / Date64SizeBytes + + return res +} + +// CastToBytes reinterprets the slice b to a slice of bytes. +func (date64Traits) CastToBytes(b []Date64) []byte { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []byte + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * Date64SizeBytes + s.Cap = h.Cap * Date64SizeBytes + + return res +} + +// Copy copies src to dst. +func (date64Traits) Copy(dst, src []Date64) { copy(dst, src) } diff --git a/integration/dask/Dockerfile b/integration/dask/Dockerfile new file mode 100644 index 0000000000000..5e054c51c561e --- /dev/null +++ b/integration/dask/Dockerfile @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM arrow:python-3.6 + +# setup /etc/localtime +RUN DEBIAN_FRONTEND=noninteractive \ + apt-get install -y -q tzdata + +# install dask release from conda +RUN conda install -c conda-forge dask pytest=3 && \ + conda clean --all + +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/integration/dask/runtest.sh diff --git a/integration/dask/runtest.sh b/integration/dask/runtest.sh new file mode 100755 index 0000000000000..baf9ccf4455db --- /dev/null +++ b/integration/dask/runtest.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
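+# (editor's note) A hypothetical probe in the same import-check style used
+# below, for any other optional pyarrow extension:
+#
+#   python -c "import pyarrow.gandiva" || { echo "gandiva missing"; exit 1; }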
+ +set -e + +# check that optional pyarrow modules are available +# because pytest would just skip the dask tests +python -c "import pyarrow.orc" +python -c "import pyarrow.parquet" + +# TODO(kszucs): the following tests also use pyarrow +# pytest -sv --pyargs dask.bytes.tests.test_s3 +# pytest -sv --pyargs dask.bytes.tests.test_hdfs +# pytest -sv --pyargs dask.bytes.tests.test_local + +# TODO(kszucs): re-enable it, for more see ARROW-3910 +# pytest -v --pyargs dask.dataframe.io.tests.test_orc +pytest -v --pyargs dask.dataframe.io.tests.test_parquet +pytest -v --pyargs dask.dataframe.tests.test_dataframe diff --git a/integration/hdfs/Dockerfile b/integration/hdfs/Dockerfile index a1d3e4eb0a598..4f72e254f06e3 100644 --- a/integration/hdfs/Dockerfile +++ b/integration/hdfs/Dockerfile @@ -15,63 +15,35 @@ # specific language governing permissions and limitations # under the License. -FROM gelog/hadoop +FROM arrow:python-3.6 -RUN apt-get update && \ - apt-get install -y \ - autoconf \ - automake \ - make \ - gcc \ - g++ \ - git \ - wget \ - pkg-config \ - ninja-build - -ENV CC=gcc \ - CXX=g++ \ - PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -# install dependencies -ARG PYTHON_VERSION=3.6 -ADD ci/docker_install_conda.sh \ - ci/conda_env_cpp.yml \ - ci/conda_env_python.yml \ - /arrow/ci/ -RUN arrow/ci/docker_install_conda.sh && \ - conda install -c conda-forge \ - --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_python.yml \ - python=$PYTHON_VERSION && \ - conda clean --all - -# installing in the previous step boost=1.60 and boost-cpp=1.67 gets installed, -# cmake finds 1.60 and parquet fails to compile -# installing it in a separate step, boost=1.60 and boost-cpp=1.64 gets -# installed, cmake finds 1.64 -# libhdfs3 needs to be pinned, see ARROW-1465 and ARROW-1445 +# installing libhdfs3; it needs to be pinned, see ARROW-1465 and ARROW-1445 RUN conda install -y -c conda-forge hdfs3 libhdfs3=2.2.31 && \ conda clean --all +# installing libhdfs (JNI) +ARG HADOOP_VERSION=2.6.5 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_HOME=/usr/local/hadoop \ + HADOOP_OPTS=-Djava.library.path=/usr/local/hadoop/lib/native \ + PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +RUN apt-get update -y && \ + apt-get install -y openjdk-8-jdk && \ + wget -q -O hadoop-$HADOOP_VERSION.tar.gz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" && \ + tar -zxf /hadoop-$HADOOP_VERSION.tar.gz && \ + rm /hadoop-$HADOOP_VERSION.tar.gz && \ + mv /hadoop-$HADOOP_VERSION /usr/local/hadoop +ADD integration/hdfs/hdfs-site.xml $HADOOP_HOME/etc/hadoop/ + # build cpp with tests -ENV ARROW_HDFS=ON \ +ENV CC=gcc \ + CXX=g++ \ + ARROW_ORC=ON \ + ARROW_HDFS=ON \ ARROW_PYTHON=ON \ - ARROW_BUILD_TESTS=ON \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native" -ADD ci/docker_build_cpp.sh /arrow/ci/ -ADD cpp /arrow/cpp -ADD format /arrow/format -ADD java/pom.xml /arrow/java/pom.xml -RUN arrow/ci/docker_build_cpp.sh - -# build python -ADD ci/docker_build_python.sh /arrow/ci/ -ADD python /arrow/python -RUN arrow/ci/docker_build_python.sh + ARROW_BUILD_TESTS=ON -# execute integration tests -ENV LIBHDFS3_CONF=/arrow/integration/hdfs/libhdfs3.xml -ADD integration /arrow/integration -CMD arrow/integration/hdfs/runtest.sh +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/integration/hdfs/runtest.sh diff --git a/integration/hdfs/hdfs-site.xml
b/integration/hdfs/hdfs-site.xml new file mode 100644 index 0000000000000..a80b945a664b7 --- /dev/null +++ b/integration/hdfs/hdfs-site.xml @@ -0,0 +1,44 @@
+<?xml version="1.0"?>
+<configuration>
+  <property>
+    <name>dfs.replication</name>
+    <value>2</value>
+  </property>
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>file:///data/dfs/data</value>
+  </property>
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>file:///data/dfs/name</value>
+  </property>
+  <property>
+    <name>dfs.namenode.checkpoint.dir</name>
+    <value>file:///data/dfs/namesecondary</value>
+  </property>
+  <property>
+    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.default.replica</name>
+    <value>1</value>
+  </property>
+</configuration>
diff --git a/integration/hdfs/libhdfs3.xml b/integration/hdfs/libhdfs3.xml deleted file mode 100644 index f929929b386da..0000000000000 --- a/integration/hdfs/libhdfs3.xml +++ /dev/null @@ -1,332 +0,0 @@ - - - - - - - - - - - - - - - rpc.client.timeout - 3600000 - - timeout interval of a RPC invocation in millisecond. default is 3600000. - - - - rpc.client.connect.tcpnodelay - true - - whether set socket TCP_NODELAY to true when connect to RPC server. default is true. - - - - - rpc.client.max.idle - 10000 - - the max idle time of a RPC connection in millisecond. default is 10000. - - - - - rpc.client.ping.interval - 10000 - - the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. - - - - - rpc.client.connect.timeout - 600000 - - the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. - - - - - rpc.client.connect.retry - 10 - - the max retry times if the RPC client fail to setup the connection to server. default is 10. - - - - - rpc.client.read.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. - - - - - rpc.client.write.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. - - - - - rpc.client.socket.linger.timeout - -1 - - set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. - - - - - - dfs.client.read.shortcircuit - false - - whether reading block file bypass datanode if the block and the client are on the same node. default is true. - - - - - dfs.default.replica - 1 - - the default number of replica. default is 3. - - - - - dfs.prefetchsize - 10 - - the default number of blocks which information will be prefetched. default is 10. - - - - - dfs.client.failover.max.attempts - 15 - - if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. - - - - - dfs.default.blocksize - 134217728 - - default block size. default is 134217728. - - - - - dfs.client.log.severity - INFO - - the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. - - - - - - input.connect.timeout - 600000 - - the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. - - - - - input.read.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. - - - - - input.write.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. - - - - - input.localread.default.buffersize - 2097152 - - number of bytes of the buffer which is used to hold the data from block file and verify checksum. - it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576.
- - - - - input.localread.blockinfo.cachesize - 1000 - - the size of block file path information cache. default is 1000. - - - - - input.read.getblockinfo.retry - 3 - - the max retry times when the client fail to get block information from namenode. default is 3. - - - - - - output.replace-datanode-on-failure - false - - whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. - - - - - output.default.chunksize - 512 - - the number of bytes of a chunk in pipeline. default is 512. - - - - - output.default.packetsize - 65536 - - the number of bytes of a packet in pipeline. default is 65536. - - - - - output.default.write.retry - 10 - - the max retry times when the client fail to setup the pipeline. default is 10. - - - - - output.connect.timeout - 600000 - - the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. - - - - - output.read.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. - - - - - output.write.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to write to datanode. default is 3600000. - - - - - output.packetpool.size - 1024 - - the max number of packets in a file's packet pool. default is 1024. - - - - - output.close.timeout - 900000 - - the timeout interval in millisecond when close an output stream. default is 900000. - - - - - dfs.domain.socket.path - /var/lib/hadoop-hdfs/dn_socket - - Optional. This is a path to a UNIX domain socket that will be used for - communication between the DataNode and local HDFS clients. - If the string "_PORT" is present in this path, it will be replaced by the - TCP port of the DataNode. - - - - - dfs.client.use.legacy.blockreader.local - false - - Legacy short-circuit reader implementation based on HDFS-2246 is used - if this configuration parameter is true. - This is for the platforms other than Linux - where the new implementation based on HDFS-347 is not available. - - - - diff --git a/integration/hdfs/runtest.sh b/integration/hdfs/runtest.sh index a90eb93645369..44afb4a1888a4 100755 --- a/integration/hdfs/runtest.sh +++ b/integration/hdfs/runtest.sh @@ -17,12 +17,21 @@ # specific language governing permissions and limitations # under the License. 
+# exit on any error set -e export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` +export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop +export LIBHDFS3_CONF=$HADOOP_CONF_DIR/hdfs-site.xml +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ +# execute cpp tests pushd /build/cpp - debug/io-hdfs-test + debug/arrow-io-hdfs-test popd -pytest -v --pyargs pyarrow +# cannot use --pyargs with custom arguments like --hdfs or --only-hdfs, because +# pytest ignores them, see https://github.com/pytest-dev/pytest/issues/3517 +export PYARROW_TEST_ONLY_HDFS=ON + +pytest -v --pyargs pyarrow.tests.test_hdfs diff --git a/integration/integration_test.py b/integration/integration_test.py index 8021aa643263e..0bced26f15acd 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -18,6 +18,7 @@ from collections import OrderedDict import argparse import binascii +import contextlib import glob import itertools import json @@ -26,7 +27,9 @@ import six import string import subprocess +import sys import tempfile +import traceback import uuid import errno @@ -893,8 +896,8 @@ def generate_dictionary_case(): dictionaries=[dict1, dict2]) -def get_generated_json_files(): - temp_dir = tempfile.mkdtemp() +def get_generated_json_files(tempdir=None): + tempdir = tempdir or tempfile.mkdtemp() def _temp_path(): return @@ -910,7 +913,7 @@ def _temp_path(): generated_paths = [] for file_obj in file_objs: - out_path = os.path.join(temp_dir, 'generated_' + + out_path = os.path.join(tempdir, 'generated_' + file_obj.name + '.json') file_obj.write(out_path) generated_paths.append(out_path) @@ -924,17 +927,36 @@ def _temp_path(): class IntegrationRunner(object): - def __init__(self, json_files, testers, debug=False): + def __init__(self, json_files, testers, tempdir=None, debug=False): self.json_files = json_files self.testers = testers - self.temp_dir = tempfile.mkdtemp() + self.temp_dir = tempdir or tempfile.mkdtemp() self.debug = debug def run(self): + failures = [] for producer, consumer in itertools.product( filter(lambda t: t.PRODUCER, self.testers), filter(lambda t: t.CONSUMER, self.testers)): - self._compare_implementations(producer, consumer) + try: + self._compare_implementations(producer, consumer) + except Exception: + traceback.print_exc() + failures.append((producer, consumer, sys.exc_info())) + return failures + + def run_flight(self): + failures = [] + servers = filter(lambda t: t.FLIGHT_SERVER, self.testers) + clients = filter(lambda t: (t.FLIGHT_CLIENT and t.CONSUMER), + self.testers) + for server, client in itertools.product(servers, clients): + try: + self._compare_flight_implementations(server, client) + except Exception: + traceback.print_exc() + failures.append((server, client, sys.exc_info())) + return failures def _compare_implementations(self, producer, consumer): print('##########################################################') @@ -950,10 +972,12 @@ def _compare_implementations(self, producer, consumer): name = os.path.splitext(os.path.basename(json_path))[0] + file_id = guid()[:8] + # Make the random access file print('-- Creating binary inputs') - producer_file_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.json_to_arrow') + producer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.json_as_file') producer.json_to_file(json_path, producer_file_path) # Validate the file @@ -961,20 +985,55 @@ def _compare_implementations(self, producer, consumer): consumer.validate(json_path, producer_file_path) print('-- Validating stream') - 
producer_stream_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.arrow_to_stream') - consumer_file_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.stream_to_arrow') + producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + + name + + '.producer_file_as_stream') + consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + + '.consumer_stream_as_file') producer.file_to_stream(producer_file_path, producer_stream_path) consumer.stream_to_file(producer_stream_path, consumer_file_path) consumer.validate(json_path, consumer_file_path) + def _compare_flight_implementations(self, producer, consumer): + print('##########################################################') + print( + '{0} serving, {1} requesting'.format(producer.name, consumer.name) + ) + print('##########################################################') + + for json_path in self.json_files: + print('==========================================================') + print('Testing file {0}'.format(json_path)) + print('==========================================================') + + name = os.path.splitext(os.path.basename(json_path))[0] + + file_id = guid()[:8] + + with producer.flight_server(): + # Have the client request the file + consumer_file_path = os.path.join( + self.temp_dir, + file_id + '_' + name + '.consumer_requested_file') + consumer.flight_request(producer.FLIGHT_PORT, + json_path, consumer_file_path) + + # Validate the file + print('-- Validating file') + consumer.validate(json_path, consumer_file_path) + + # TODO: also have the client upload the file + class Tester(object): PRODUCER = False CONSUMER = False + FLIGHT_SERVER = False + FLIGHT_CLIENT = False + FLIGHT_PORT = 31337 def __init__(self, debug=False): self.debug = debug @@ -991,10 +1050,20 @@ def file_to_stream(self, file_path, stream_path): def validate(self, json_path, arrow_path): raise NotImplementedError + def flight_server(self): + raise NotImplementedError + + def flight_request(self, port, json_path, arrow_path): + raise NotImplementedError + class JavaTester(Tester): PRODUCER = True CONSUMER = True + FLIGHT_SERVER = True + FLIGHT_CLIENT = True + + FLIGHT_PORT = 31338 _arrow_version = load_version_from_pom() ARROW_TOOLS_JAR = os.environ.get( @@ -1002,6 +1071,15 @@ class JavaTester(Tester): os.path.join(ARROW_HOME, 'java/tools/target/arrow-tools-{}-' 'jar-with-dependencies.jar'.format(_arrow_version))) + ARROW_FLIGHT_JAR = os.environ.get( + 'ARROW_FLIGHT_JAVA_INTEGRATION_JAR', + os.path.join(ARROW_HOME, + 'java/flight/target/arrow-flight-{}-' + 'jar-with-dependencies.jar'.format(_arrow_version))) + ARROW_FLIGHT_SERVER = ('org.apache.arrow.flight.example.integration.' + 'IntegrationTestServer') + ARROW_FLIGHT_CLIENT = ('org.apache.arrow.flight.example.integration.' 
+ 'IntegrationTestClient') name = 'Java' @@ -1044,18 +1122,58 @@ def file_to_stream(self, file_path, stream_path): print(' '.join(cmd)) run_cmd(cmd) + def flight_request(self, port, json_path, arrow_path): + cmd = ['java', '-cp', self.ARROW_FLIGHT_JAR, + self.ARROW_FLIGHT_CLIENT, + '-port', str(port), + '-j', json_path, + '-a', arrow_path] + if self.debug: + print(' '.join(cmd)) + run_cmd(cmd) + + @contextlib.contextmanager + def flight_server(self): + cmd = ['java', '-cp', self.ARROW_FLIGHT_JAR, + self.ARROW_FLIGHT_SERVER, + '-port', str(self.FLIGHT_PORT)] + if self.debug: + print(' '.join(cmd)) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE) + try: + output = server.stdout.readline().decode() + if not output.startswith("Server listening on localhost"): + raise RuntimeError( + "Flight-Java server did not start properly, output: " + + output) + yield + finally: + server.terminate() + server.wait(5) + class CPPTester(Tester): PRODUCER = True CONSUMER = True + FLIGHT_SERVER = True + FLIGHT_CLIENT = True EXE_PATH = os.environ.get( 'ARROW_CPP_EXE_PATH', os.path.join(ARROW_HOME, 'cpp/build/debug')) - CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'json-integration-test') - STREAM_TO_FILE = os.path.join(EXE_PATH, 'stream-to-file') - FILE_TO_STREAM = os.path.join(EXE_PATH, 'file-to-stream') + CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'arrow-json-integration-test') + STREAM_TO_FILE = os.path.join(EXE_PATH, 'arrow-stream-to-file') + FILE_TO_STREAM = os.path.join(EXE_PATH, 'arrow-file-to-stream') + + FLIGHT_PORT = 31337 + + FLIGHT_SERVER_CMD = [ + os.path.join(EXE_PATH, 'flight-test-integration-server'), + "-port", str(FLIGHT_PORT)] + FLIGHT_CLIENT_CMD = [ + os.path.join(EXE_PATH, 'flight-test-integration-client'), + "-host", "localhost"] name = 'C++' @@ -1095,6 +1213,33 @@ def file_to_stream(self, file_path, stream_path): print(cmd) os.system(cmd) + @contextlib.contextmanager + def flight_server(self): + if self.debug: + print(' '.join(self.FLIGHT_SERVER_CMD)) + server = subprocess.Popen(self.FLIGHT_SERVER_CMD, + stdout=subprocess.PIPE) + try: + output = server.stdout.readline().decode() + if not output.startswith("Server listening on localhost"): + raise RuntimeError( + "Flight-C++ server did not start properly, output: " + + output) + yield + finally: + server.terminate() + server.wait(5) + + def flight_request(self, port, json_path, arrow_path): + cmd = self.FLIGHT_CLIENT_CMD + [ + '-port=' + str(port), + '-path=' + json_path, + '-output=' + arrow_path + ] + if self.debug: + print(' '.join(cmd)) + subprocess.run(cmd) + class JSTester(Tester): PRODUCER = True @@ -1118,7 +1263,7 @@ def _run(self, exe_cmd, arrow_path=None, json_path=None, if json_path is not None: cmd.extend(['-j', json_path]) - cmd.extend(['--mode', command, '-t', 'es5', '-m', 'umd']) + cmd.extend(['--mode', command]) if self.debug: print(' '.join(cmd)) @@ -1162,17 +1307,32 @@ def get_static_json_files(): return glob.glob(glob_pattern) -def run_all_tests(debug=False): +def run_all_tests(run_flight=False, debug=False, tempdir=None): testers = [CPPTester(debug=debug), JavaTester(debug=debug), JSTester(debug=debug)] static_json_files = get_static_json_files() - generated_json_files = get_generated_json_files() + generated_json_files = get_generated_json_files(tempdir=tempdir) json_files = static_json_files + generated_json_files - runner = IntegrationRunner(json_files, testers, debug=debug) - runner.run() - print('-- All tests passed!') + runner = IntegrationRunner(json_files, testers, + tempdir=tempdir, debug=debug) + 
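+    # (editor's note) Failures are collected instead of raised so that every
+    # producer/consumer pair still runs; each entry is a tuple of
+    # (producer, consumer, sys.exc_info()) that is reported at the end.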
failures = [] + failures.extend(runner.run()) + if run_flight: + failures.extend(runner.run_flight()) + + print() + print('##########################################################') + if not failures: + print('-- All tests passed!') + else: + print('-- Tests completed, failures:') + for producer, consumer, exc_info in failures: + print("FAILED TEST:", producer.name, "producing, ", + consumer.name, "consuming") + traceback.print_exception(*exc_info) + print() def write_js_test_json(directory): @@ -1192,9 +1352,16 @@ def write_js_test_json(directory): parser.add_argument('--write_generated_json', dest='generated_json_path', action='store', default=False, help='Generate test JSON') + parser.add_argument('--run_flight', dest='run_flight', + action='store_true', default=False, + help='Run Flight integration tests') parser.add_argument('--debug', dest='debug', action='store_true', default=False, help='Run executables in debug mode as relevant') + parser.add_argument('--tempdir', dest='tempdir', + default=tempfile.mkdtemp(), + help=('Directory to use for writing ' + 'integration test temporary files')) args = parser.parse_args() if args.generated_json_path: try: @@ -1204,4 +1371,5 @@ def write_js_test_json(directory): raise write_js_test_json(args.generated_json_path) else: - run_all_tests(debug=args.debug) + run_all_tests(run_flight=args.run_flight, + debug=args.debug, tempdir=args.tempdir) diff --git a/integration/spark/2.4.0.patch b/integration/spark/2.4.0.patch new file mode 100644 index 0000000000000..7992e010f63b9 --- /dev/null +++ b/integration/spark/2.4.0.patch @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/pom.xml b/pom.xml +index f0e5ed9c56..b30d4d61d6 100644 +--- a/pom.xml ++++ b/pom.xml +@@ -2092,0 +2093,2 @@ ++ -Xmax-classfile-name ++ 128 diff --git a/integration/spark/Dockerfile b/integration/spark/Dockerfile new file mode 100644 index 0000000000000..5c28cca0db447 --- /dev/null +++ b/integration/spark/Dockerfile @@ -0,0 +1,57 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM arrow:python-3.6 + +# installing java and maven +ARG MAVEN_VERSION=3.5.4 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + MAVEN_HOME=/usr/local/maven \ + M2_HOME=/root/.m2 \ + PATH=/root/.m2/bin:/usr/local/maven/bin:$PATH +RUN apt-get update -q -y && \ + apt-get install -q -y openjdk-8-jdk && \ + wget -q -O maven-$MAVEN_VERSION.tar.gz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=maven/maven-3/$MAVEN_VERSION/binaries/apache-maven-$MAVEN_VERSION-bin.tar.gz" && \ + tar -zxf /maven-$MAVEN_VERSION.tar.gz && \ + rm /maven-$MAVEN_VERSION.tar.gz && \ + mv /apache-maven-$MAVEN_VERSION /usr/local/maven + +# installing specific version of spark +ARG SPARK_VERSION=2.4.0 +RUN mkdir /spark && \ + cd /spark && \ + wget -q -O spark.tar.gz https://github.com/apache/spark/archive/v$SPARK_VERSION.tar.gz && \ + tar -xzf spark.tar.gz && \ + rm spark.tar.gz + +# patching spark is required in order to: +# - avoid too long filenames error https://issues.apache.org/jira/browse/SPARK-4820 +ADD integration/spark/$SPARK_VERSION.patch /arrow/integration/spark/$SPARK_VERSION.patch +RUN cd /spark/spark-$SPARK_VERSION && \ + patch -p1 < /arrow/integration/spark/$SPARK_VERSION.patch + +# build cpp with tests +ENV CC=gcc \ + CXX=g++ \ + ARROW_PYTHON=ON \ + ARROW_HDFS=ON \ + ARROW_BUILD_TESTS=OFF + +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/ci/docker_build_java.sh && \ + arrow/integration/spark/runtest.sh diff --git a/integration/spark/runtest.sh b/integration/spark/runtest.sh new file mode 100755 index 0000000000000..173f69efb27b4 --- /dev/null +++ b/integration/spark/runtest.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
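+# (editor's summary) This script pins Spark's arrow.version to the Arrow Java
+# artifacts built in this container, rebuilds Spark's sql/core and assembly
+# modules, then runs only the Arrow-related Scala suites and the pyspark-sql
+# Python tests listed below.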
+ +# exit on any error +set -e + +SPARK_VERSION=${SPARK_VERSION:-2.4.0} + +# rsynced source directory to build java libs +arrow_src=/build/java/arrow + +pushd $arrow_src/java + ARROW_VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'` +popd + +MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=warn" + +# build Spark with Arrow +pushd /spark/spark-${SPARK_VERSION} + # update the Spark pom with the Arrow version just installed and build Spark; the package phase is needed for pyspark + echo "Building Spark with Arrow $ARROW_VERSION" + mvn -q versions:set-property -Dproperty=arrow.version -DnewVersion=$ARROW_VERSION + + build/mvn -DskipTests package -pl sql/core -pl assembly -am + + SPARK_SCALA_TESTS=( + "org.apache.spark.sql.execution.arrow" + "org.apache.spark.sql.execution.vectorized.ColumnarBatchSuite" + "org.apache.spark.sql.execution.vectorized.ArrowColumnVectorSuite") + + (echo "Testing Spark:"; IFS=$'\n'; echo "${SPARK_SCALA_TESTS[*]}") + + # TODO: should be able to only build spark-sql tests with adding "-pl sql/core" but not currently working + build/mvn -Dtest=none -DwildcardSuites=$(IFS=,; echo "${SPARK_SCALA_TESTS[*]}") test + + # Run pyarrow related Python tests only + echo "Testing PySpark:" + python/run-tests --modules pyspark-sql +popd diff --git a/java/Dockerfile b/java/Dockerfile index 4ef9d28775e5f..d722e3d7ab82c 100644 --- a/java/Dockerfile +++ b/java/Dockerfile @@ -17,6 +17,10 @@ FROM maven:3.5.2-jdk-8-slim -WORKDIR /arrow/java +# rsync is required to prevent the contamination of the arrow directory +# (mounted from the host) +RUN apt-get update -y && apt-get install -y rsync -CMD mvn test +CMD arrow/ci/docker_build_java.sh && \ + cd /build/java/arrow/java && \ + mvn test diff --git a/java/README.md b/java/README.md index 5a5f4d2c4211a..c69ff88ffa260 100644 --- a/java/README.md +++ b/java/README.md @@ -64,17 +64,13 @@ Refer to `java/dev/checkstyle/checkstyle.xml` for rule specifics. ## Test Logging Configuration When running tests, Arrow Java uses the Logback logger with SLF4J. By default, -Logback has a log level set to DEBUG. Besides setting this level -programmatically, it can also be configured with a file named either -"logback.xml" or "logback-test.xml" residing in the classpath. The file -location can also be specified in the Maven command line with the following -option `-Dlogback.configurationFile=file:`. A sample -logback.xml file is available in `java/dev` with a log level of ERROR. Arrow -Java can be built with this file using the following command run in the project -root directory: +it uses the logback.xml present in the corresponding module's src/test/resources +directory, which has the default log level set to INFO. +Arrow Java can be built with an alternate logback configuration file using the +following command run in the project root directory: ```bash -mvn -Dlogback.configurationFile=file:`pwd`/dev/logback.xml +mvn -Dlogback.configurationFile=file:<path-to-logback.xml> ``` See [Logback Configuration][1] for more details.
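For instance, with a configuration file at a hypothetical /tmp/logback.xml, the command above would read (illustration only, not from the original README):

    mvn -Dlogback.configurationFile=file:/tmp/logback.xml test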
diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index ab0b9b55a92f5..dc919502c5858 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -16,7 +16,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT ../../pom.xml diff --git a/java/dev/logback.xml b/java/adapter/jdbc/src/test/resources/logback.xml similarity index 84% rename from java/dev/logback.xml rename to java/adapter/jdbc/src/test/resources/logback.xml index 10d54806b184a..4c54d18a210ff 100644 --- a/java/dev/logback.xml +++ b/java/adapter/jdbc/src/test/resources/logback.xml @@ -1,3 +1,4 @@ + - - - + @@ -23,7 +20,9 @@ $ mvn -Dlogback.configurationFile=file:${ARROW_HOME}/java/dev/logback.xml test - + + - + + diff --git a/java/flight/pom.xml b/java/flight/pom.xml index c6de29f8d005d..48939df886fd4 100644 --- a/java/flight/pom.xml +++ b/java/flight/pom.xml @@ -11,7 +11,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT arrow-flight @@ -48,19 +48,16 @@ io.grpc grpc-netty ${dep.grpc.version} - provided io.grpc grpc-core ${dep.grpc.version} - provided io.grpc grpc-protobuf ${dep.grpc.version} - provided io.netty @@ -75,11 +72,15 @@ com.google.guava guava + + commons-cli + commons-cli + 1.4 + io.grpc grpc-stub ${dep.grpc.version} - provided com.google.protobuf @@ -103,6 +104,10 @@ org.slf4j slf4j-api + + javax.annotation + javax.annotation-api + @@ -214,13 +219,50 @@ analyze-only - + io.netty:netty-tcnative-boringssl-static:* + + org.codehaus.mojo + build-helper-maven-plugin + 1.9.1 + + + add-generated-sources-to-classpath + generate-sources + + add-source + + + + ${project.build.directory}/generated-sources/protobuf + + + + + + + maven-assembly-plugin + 3.0.0 + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + diff --git a/java/flight/src/main/java/org/apache/arrow/flight/ArrowMessage.java b/java/flight/src/main/java/org/apache/arrow/flight/ArrowMessage.java index 9764ff39a4a19..d2f7bb6c713b5 100644 --- a/java/flight/src/main/java/org/apache/arrow/flight/ArrowMessage.java +++ b/java/flight/src/main/java/org/apache/arrow/flight/ArrowMessage.java @@ -22,6 +22,8 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.arrow.flatbuf.Message; @@ -52,10 +54,12 @@ import io.grpc.MethodDescriptor.Marshaller; import io.grpc.internal.ReadableBuffer; import io.grpc.protobuf.ProtoUtils; + import io.netty.buffer.ArrowBuf; import io.netty.buffer.ByteBuf; import io.netty.buffer.ByteBufInputStream; import io.netty.buffer.CompositeByteBuf; +import io.netty.buffer.Unpooled; /** * The in-memory representation of FlightData used to manage a stream of Arrow messages. @@ -95,6 +99,18 @@ public static HeaderType getHeader(byte b) { } + // Pre-allocated buffers for padding serialized ArrowMessages. 
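+  // (editor's illustration, not in the original patch) The alignment rule
+  // applied in asInputStream() below: for a buffer with n readable bytes the
+  // padding is (8 - n % 8) % 8, so a 13-byte buffer is followed by
+  // PADDING_BUFFERS.get(3) and an already aligned buffer gets no padding.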
+ private static List PADDING_BUFFERS = Arrays.asList( + null, + Unpooled.copiedBuffer(new byte[] { 0 }), + Unpooled.copiedBuffer(new byte[] { 0, 0 }), + Unpooled.copiedBuffer(new byte[] { 0, 0, 0 }), + Unpooled.copiedBuffer(new byte[] { 0, 0, 0, 0 }), + Unpooled.copiedBuffer(new byte[] { 0, 0, 0, 0, 0 }), + Unpooled.copiedBuffer(new byte[] { 0, 0, 0, 0, 0, 0 }), + Unpooled.copiedBuffer(new byte[] { 0, 0, 0, 0, 0, 0, 0 }) + ); + private final FlightDescriptor descriptor; private final Message message; private final List bufs; @@ -253,8 +269,17 @@ private InputStream asInputStream(BufferAllocator allocator) { cos.writeTag(FlightData.DATA_BODY_FIELD_NUMBER, WireFormat.WIRETYPE_LENGTH_DELIMITED); int size = 0; + List allBufs = new ArrayList<>(); for (ArrowBuf b : bufs) { + allBufs.add(b); size += b.readableBytes(); + // [ARROW-4213] These buffers must be aligned to an 8-byte boundary in order to be readable from C++. + if (b.readableBytes() % 8 != 0) { + int paddingBytes = 8 - (b.readableBytes() % 8); + assert paddingBytes > 0 && paddingBytes < 8; + size += paddingBytes; + allBufs.add(PADDING_BUFFERS.get(paddingBytes).retain()); + } } // rawvarint is used for length definition. cos.writeUInt32NoTag(size); @@ -263,7 +288,7 @@ private InputStream asInputStream(BufferAllocator allocator) { ArrowBuf initialBuf = allocator.buffer(baos.size()); initialBuf.writeBytes(baos.toByteArray()); final CompositeByteBuf bb = new CompositeByteBuf(allocator.getAsByteBufAllocator(), true, bufs.size() + 1, - ImmutableList.builder().add(initialBuf).addAll(bufs).build()); + ImmutableList.builder().add(initialBuf).addAll(allBufs).build()); final ByteBufInputStream is = new DrainableByteBufInputStream(bb); return is; } catch (Exception ex) { diff --git a/java/flight/src/main/java/org/apache/arrow/flight/FlightInfo.java b/java/flight/src/main/java/org/apache/arrow/flight/FlightInfo.java index 5e7aad178e70d..9accbbe434a10 100644 --- a/java/flight/src/main/java/org/apache/arrow/flight/FlightInfo.java +++ b/java/flight/src/main/java/org/apache/arrow/flight/FlightInfo.java @@ -17,13 +17,22 @@ package org.apache.arrow.flight; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.Channels; import java.util.List; import java.util.stream.Collectors; import org.apache.arrow.flight.impl.Flight; import org.apache.arrow.flight.impl.Flight.FlightGetInfo; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.WriteChannel; +import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.arrow.vector.types.pojo.Schema; +import com.fasterxml.jackson.databind.util.ByteBufferBackedInputStream; + import com.google.common.collect.ImmutableList; import com.google.protobuf.ByteString; @@ -45,8 +54,15 @@ public FlightInfo(Schema schema, FlightDescriptor descriptor, List 0 ? - Schema.deserialize(flightGetInfo.getSchema().asReadOnlyByteBuffer()) : new Schema(ImmutableList.of()); + try { + final ByteBuffer schemaBuf = flightGetInfo.getSchema().asReadOnlyByteBuffer(); + schema = flightGetInfo.getSchema().size() > 0 ? 
+ MessageSerializer.deserializeSchema( + new ReadChannel(Channels.newChannel(new ByteBufferBackedInputStream(schemaBuf)))) + : new Schema(ImmutableList.of()); + } catch (IOException e) { + throw new RuntimeException(e); + } descriptor = new FlightDescriptor(flightGetInfo.getFlightDescriptor()); endpoints = flightGetInfo.getEndpointList().stream().map(t -> new FlightEndpoint(t)).collect(Collectors.toList()); bytes = flightGetInfo.getTotalBytes(); @@ -74,9 +90,16 @@ public List getEndpoints() { } FlightGetInfo toProtocol() { + // Encode schema in a Message payload + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + MessageSerializer.serialize(new WriteChannel(Channels.newChannel(baos)), schema); + } catch (IOException e) { + throw new RuntimeException(e); + } return Flight.FlightGetInfo.newBuilder() .addAllEndpoint(endpoints.stream().map(t -> t.toProtocol()).collect(Collectors.toList())) - .setSchema(ByteString.copyFrom(schema.toByteArray())) + .setSchema(ByteString.copyFrom(baos.toByteArray())) .setFlightDescriptor(descriptor.toProtocol()) .setTotalBytes(FlightInfo.this.bytes) .setTotalRecords(records) diff --git a/java/flight/src/main/java/org/apache/arrow/flight/FlightService.java b/java/flight/src/main/java/org/apache/arrow/flight/FlightService.java index 91499123134c3..389497e884d09 100644 --- a/java/flight/src/main/java/org/apache/arrow/flight/FlightService.java +++ b/java/flight/src/main/java/org/apache/arrow/flight/FlightService.java @@ -128,7 +128,8 @@ public boolean isCancelled() { @Override public void start(VectorSchemaRoot root) { responseObserver.onNext(new ArrowMessage(null, root.getSchema())); - unloader = new VectorUnloader(root, true, false); + // [ARROW-4213] We must align buffers to be compatible with other languages. + unloader = new VectorUnloader(root, true, true); } @Override diff --git a/java/flight/src/main/java/org/apache/arrow/flight/FlightStream.java b/java/flight/src/main/java/org/apache/arrow/flight/FlightStream.java index 5cba7ab47aa30..616b9cdc267a5 100644 --- a/java/flight/src/main/java/org/apache/arrow/flight/FlightStream.java +++ b/java/flight/src/main/java/org/apache/arrow/flight/FlightStream.java @@ -178,6 +178,7 @@ public void onNext(ArrowMessage msg) { public void onError(Throwable t) { ex = t; queue.add(DONE_EX); + root.setException(t); } @Override diff --git a/java/flight/src/main/java/org/apache/arrow/flight/example/integration/IntegrationTestClient.java b/java/flight/src/main/java/org/apache/arrow/flight/example/integration/IntegrationTestClient.java new file mode 100644 index 0000000000000..803a56c6c1afe --- /dev/null +++ b/java/flight/src/main/java/org/apache/arrow/flight/example/integration/IntegrationTestClient.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.flight.example.integration; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +import org.apache.arrow.flight.FlightClient; +import org.apache.arrow.flight.FlightDescriptor; +import org.apache.arrow.flight.FlightEndpoint; +import org.apache.arrow.flight.FlightInfo; +import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.Location; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; + +/** + * An example Flight client that requests a dataset from an integration test server and writes it to an Arrow file. + */ +class IntegrationTestClient { + private static final org.slf4j.Logger LOGGER = org.slf4j.LoggerFactory.getLogger(IntegrationTestClient.class); + private final Options options; + + private IntegrationTestClient() { + options = new Options(); + options.addOption("a", "arrow", true, "arrow file"); + options.addOption("j", "json", true, "json file"); + options.addOption("host", true, "The host to connect to."); + options.addOption("port", true, "The port to connect to."); + } + + public static void main(String[] args) { + try { + new IntegrationTestClient().run(args); + } catch (ParseException e) { + fatalError("Invalid parameters", e); + } catch (IOException e) { + fatalError("Error accessing files", e); + } + } + + static void fatalError(String message, Throwable e) { + System.err.println(message); + System.err.println(e.getMessage()); + LOGGER.error(message, e); + System.exit(1); + } + + private void run(String[] args) throws ParseException, IOException { + CommandLineParser parser = new DefaultParser(); + CommandLine cmd = parser.parse(options, args, false); + + String fileName = cmd.getOptionValue("arrow"); + if (fileName == null) { + throw new IllegalArgumentException("missing arrow file parameter"); + } + File arrowFile = new File(fileName); + if (arrowFile.exists()) { + throw new IllegalArgumentException("arrow file already exists: " + arrowFile.getAbsolutePath()); + } + + final String host = cmd.getOptionValue("host", "localhost"); + final int port = Integer.parseInt(cmd.getOptionValue("port", "31337")); + + final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + FlightClient client = new FlightClient(allocator, new Location(host, port)); + FlightInfo info = client.getInfo(FlightDescriptor.path(cmd.getOptionValue("json"))); + List<FlightEndpoint> endpoints = info.getEndpoints(); + if (endpoints.isEmpty()) { + throw new RuntimeException("No endpoints returned from Flight server."); + } + + FlightStream stream = client.getStream(info.getEndpoints().get(0).getTicket()); + try (VectorSchemaRoot root = stream.getRoot(); + FileOutputStream fileOutputStream = new FileOutputStream(arrowFile); + ArrowFileWriter arrowWriter = new ArrowFileWriter(root, new DictionaryProvider.MapDictionaryProvider(), + fileOutputStream.getChannel())) { + while (stream.next()) { + arrowWriter.writeBatch(); + } + } + } +} diff --git a/java/flight/src/main/java/org/apache/arrow/flight/example/integration/IntegrationTestServer.java
b/java/flight/src/main/java/org/apache/arrow/flight/example/integration/IntegrationTestServer.java new file mode 100644 index 0000000000000..7b45e53a149be --- /dev/null +++ b/java/flight/src/main/java/org/apache/arrow/flight/example/integration/IntegrationTestServer.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.flight.example.integration; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.concurrent.Callable; + +import org.apache.arrow.flight.Action; +import org.apache.arrow.flight.ActionType; +import org.apache.arrow.flight.Criteria; +import org.apache.arrow.flight.FlightDescriptor; +import org.apache.arrow.flight.FlightEndpoint; +import org.apache.arrow.flight.FlightInfo; +import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.FlightServer; +import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.Location; +import org.apache.arrow.flight.Result; +import org.apache.arrow.flight.Ticket; +import org.apache.arrow.flight.auth.ServerAuthHandler; +import org.apache.arrow.flight.impl.Flight; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.JsonFileReader; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; + +class IntegrationTestServer { + private final Options options; + + private IntegrationTestServer() { + options = new Options(); + options.addOption("port", true, "The port to serve on."); + } + + private void run(String[] args) throws Exception { + CommandLineParser parser = new DefaultParser(); + CommandLine cmd = parser.parse(options, args, false); + + final BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + final int port = Integer.parseInt(cmd.getOptionValue("port", "31337")); + try (final IntegrationFlightProducer producer = new IntegrationFlightProducer(allocator); + final FlightServer server = new FlightServer(allocator, port, producer, ServerAuthHandler.NO_OP)) { + server.start(); + // Print out message for integration test script + System.out.println("Server listening on localhost:" + server.getPort()); + while (true) { + Thread.sleep(30000); + } + } + } + + public static void main(String[] args) { + try { + new IntegrationTestServer().run(args); + } catch (ParseException e) { + IntegrationTestClient.fatalError("Error parsing arguments", e); + } catch (Exception e) { + 
IntegrationTestClient.fatalError("Runtime error", e); + } + } + + static class IntegrationFlightProducer implements FlightProducer, AutoCloseable { + private final BufferAllocator allocator; + + IntegrationFlightProducer(BufferAllocator allocator) { + this.allocator = allocator; + } + + @Override + public void close() { + allocator.close(); + } + + @Override + public void getStream(Ticket ticket, ServerStreamListener listener) { + String path = new String(ticket.getBytes(), StandardCharsets.UTF_8); + File inputFile = new File(path); + try (JsonFileReader reader = new JsonFileReader(inputFile, allocator)) { + Schema schema = reader.start(); + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + listener.start(root); + while (reader.read(root)) { + listener.putNext(); + } + listener.completed(); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void listFlights(Criteria criteria, StreamListener listener) { + listener.onCompleted(); + } + + @Override + public FlightInfo getFlightInfo(FlightDescriptor descriptor) { + if (descriptor.isCommand()) { + throw new UnsupportedOperationException("Commands not supported."); + } + if (descriptor.getPath().size() < 1) { + throw new IllegalArgumentException("Must provide a path."); + } + String path = descriptor.getPath().get(0); + File inputFile = new File(path); + try (JsonFileReader reader = new JsonFileReader(inputFile, allocator)) { + Schema schema = reader.start(); + return new FlightInfo(schema, descriptor, + Collections.singletonList(new FlightEndpoint(new Ticket(path.getBytes()), + new Location("localhost", 31338))), + 0, 0); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public Callable acceptPut(FlightStream flightStream) { + return null; + } + + @Override + public Result doAction(Action action) { + return null; + } + + @Override + public void listActions(StreamListener listener) { + listener.onCompleted(); + } + } +} diff --git a/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java b/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java index 6b23a40f29348..71c90d3a00d47 100644 --- a/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java +++ b/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import com.google.common.collect.ImmutableList; @@ -78,6 +79,7 @@ public void ensureIndependentSteams() throws Exception { /** * Make sure that a stream doesn't go faster than the consumer is consuming. */ + @Ignore @Test public void ensureWaitUntilProceed() throws Exception { // request some values. diff --git a/java/flight/src/test/resources/logback.xml b/java/flight/src/test/resources/logback.xml index e409e92b6fae1..444b2ed6d8392 100644 --- a/java/flight/src/test/resources/logback.xml +++ b/java/flight/src/test/resources/logback.xml @@ -11,6 +11,7 @@ language governing permissions and limitations under the License. 
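The IntegrationFlightProducer above uses a simple convention: the JSON file path is the entire ticket payload, encoded in getFlightInfo() and decoded back in getStream(). A sketch of that convention in isolation (the helper class is illustrative; Ticket's byte-array constructor and getBytes() are from this patch):

    import java.nio.charset.StandardCharsets;

    import org.apache.arrow.flight.Ticket;

    class TicketPathCodec {
      // getFlightInfo() encodes the JSON file path into the ticket...
      static Ticket encode(String path) {
        return new Ticket(path.getBytes(StandardCharsets.UTF_8));
      }

      // ...and getStream() decodes it back into a path to read.
      static String decode(Ticket ticket) {
        return new String(ticket.getBytes(), StandardCharsets.UTF_8);
      }
    }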
--> + true @@ -24,4 +25,4 @@ - \ No newline at end of file + diff --git a/java/format/pom.xml b/java/format/pom.xml index d5ccd5ff08be8..2c3dc03acab1d 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -15,7 +15,7 @@ arrow-java-root org.apache.arrow - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT arrow-format diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 39752e2d36913..a0901530a612d 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -16,7 +16,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT org.apache.arrow.gandiva @@ -29,7 +29,7 @@ 2.5.0 18.0 true - ../../cpp/debug + ../../cpp/debug/debug @@ -133,7 +133,7 @@ - ${gandiva.cpp.build.dir} + ${gandiva.cpp.build.dir}/../src/gandiva irhelpers.bc diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java index 96788b39e08ec..46deee95fa717 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java @@ -17,8 +17,6 @@ package org.apache.arrow.gandiva.evaluator; -import org.apache.arrow.gandiva.exceptions.GandivaException; - /** * Used to construct gandiva configuration objects. */ @@ -26,16 +24,6 @@ public class ConfigurationBuilder { private String byteCodeFilePath = ""; - private static volatile long defaultConfiguration = 0L; - - /** - * Ctor - ensure that gandiva is loaded. - * @throws GandivaException - if library cannot be loaded. - */ - public ConfigurationBuilder() throws GandivaException { - JniWrapper.getInstance(); - } - public ConfigurationBuilder withByteCodeFilePath(final String byteCodeFilePath) { this.byteCodeFilePath = byteCodeFilePath; return this; @@ -45,26 +33,6 @@ public String getByteCodeFilePath() { return byteCodeFilePath; } - /** - * Get the default configuration to invoke gandiva. - * @return default configuration - * @throws GandivaException if unable to get native builder instance. - */ - static long getDefaultConfiguration() throws GandivaException { - if (defaultConfiguration == 0L) { - synchronized (ConfigurationBuilder.class) { - if (defaultConfiguration == 0L) { - String defaultByteCodeFilePath = JniWrapper.getInstance().getByteCodeFilePath(); - - defaultConfiguration = new ConfigurationBuilder() - .withByteCodeFilePath(defaultByteCodeFilePath) - .buildConfigInstance(); - } - } - } - return defaultConfiguration; - } - public native long buildConfigInstance(); public native void releaseConfigInstance(long configId); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java new file mode 100644 index 0000000000000..37dd0f61056b0 --- /dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; + +public class DecimalTypeUtil { + + public enum OperationType { + ADD, + SUBTRACT, + MULTIPLY, + DIVIDE, + MOD + } + + private static final int MIN_ADJUSTED_SCALE = 6; + /// The maximum precision representable by a 16-byte decimal + private static final int MAX_PRECISION = 38; + + public static Decimal getResultTypeForOperation(OperationType operation, Decimal operand1, Decimal + operand2) { + int s1 = operand1.getScale(); + int s2 = operand2.getScale(); + int p1 = operand1.getPrecision(); + int p2 = operand2.getPrecision(); + int resultScale = 0; + int resultPrecision = 0; + switch (operation) { + case ADD: + case SUBTRACT: + resultScale = Math.max(operand1.getScale(), operand2.getScale()); + resultPrecision = resultScale + Math.max(operand1.getPrecision() - operand1.getScale(), + operand2.getPrecision() - operand2.getScale()) + 1; + break; + case MULTIPLY: + resultScale = s1 + s2; + resultPrecision = p1 + p2 + 1; + break; + case DIVIDE: + resultScale = + Math.max(MIN_ADJUSTED_SCALE, operand1.getScale() + operand2.getPrecision() + 1); + resultPrecision = + operand1.getPrecision() - operand1.getScale() + operand2.getScale() + resultScale; + break; + case MOD: + resultScale = Math.max(operand1.getScale(), operand2.getScale()); + resultPrecision = Math.min(operand1.getPrecision() - operand1.getScale(), + operand2.getPrecision() - operand2.getScale()) + + resultScale; + break; + default: + throw new RuntimeException("Needs support"); + } + return adjustScaleIfNeeded(resultPrecision, resultScale); + } + + private static Decimal adjustScaleIfNeeded(int precision, int scale) { + if (precision > MAX_PRECISION) { + int minScale = Math.min(scale, MIN_ADJUSTED_SCALE); + int delta = precision - MAX_PRECISION; + precision = MAX_PRECISION; + scale = Math.max(scale - delta, minScale); + } + return new Decimal(precision, scale); + } + +} + diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 9c41c1942e9b3..b9986791850a7 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -70,7 +70,7 @@ public static ExpressionRegistry getInstance() throws GandivaException { synchronized (ExpressionRegistry.class) { if (INSTANCE == null) { // ensure library is setup. 
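Stepping back to DecimalTypeUtil above: the precision/scale adjustment is easiest to see with numbers. Here is the arithmetic for one ADD case that DecimalTypeUtilTest exercises later in this patch, (38, 10) + (38, 2):

    import org.apache.arrow.gandiva.evaluator.DecimalTypeUtil;
    import org.apache.arrow.vector.types.pojo.ArrowType;

    public class DecimalAddExample {
      public static void main(String[] args) {
        ArrowType.Decimal a = new ArrowType.Decimal(38, 10);
        ArrowType.Decimal b = new ArrowType.Decimal(38, 2);
        // Raw result: scale = max(10, 2) = 10,
        //             precision = 10 + max(38 - 10, 38 - 2) + 1 = 47.
        // 47 exceeds MAX_PRECISION (38), so adjustScaleIfNeeded applies:
        //   delta = 47 - 38 = 9, minScale = min(10, 6) = 6,
        //   scale = max(10 - 9, 6) = 6, precision capped at 38.
        ArrowType.Decimal result = DecimalTypeUtil.getResultTypeForOperation(
            DecimalTypeUtil.OperationType.ADD, a, b);
        System.out.println(result); // Decimal(38, 6)
      }
    }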
- JniWrapper.getInstance(); + JniLoader.getInstance(); Set typesFromGandiva = getSupportedTypesFromGandiva(); Set functionsFromGandiva = getSupportedFunctionsFromGandiva(); INSTANCE = new ExpressionRegistry(typesFromGandiva, functionsFromGandiva); @@ -173,10 +173,11 @@ private static ArrowType getArrowType(ExtGandivaType type) { BIT_WIDTH_64); case GandivaType.NONE_VALUE: return new ArrowType.Null(); + case GandivaType.DECIMAL_VALUE: + return new ArrowType.Decimal(0,0); case GandivaType.FIXED_SIZE_BINARY_VALUE: case GandivaType.MAP_VALUE: case GandivaType.INTERVAL_VALUE: - case GandivaType.DECIMAL_VALUE: case GandivaType.DICTIONARY_VALUE: case GandivaType.LIST_VALUE: case GandivaType.STRUCT_VALUE: diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java index 25904d3dc1d76..4e9abedadf0f5 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java @@ -43,11 +43,13 @@ public class Filter { private static final Logger logger = LoggerFactory.getLogger(Filter.class); + private final JniWrapper wrapper; private final long moduleId; private final Schema schema; private boolean closed; - private Filter(long moduleId, Schema schema) { + private Filter(JniWrapper wrapper, long moduleId, Schema schema) { + this.wrapper = wrapper; this.moduleId = moduleId; this.schema = schema; this.closed = false; @@ -63,7 +65,7 @@ private Filter(long moduleId, Schema schema) { * @return A native filter object that can be used to invoke on a RecordBatch */ public static Filter make(Schema schema, Condition condition) throws GandivaException { - return make(schema, condition, ConfigurationBuilder.getDefaultConfiguration()); + return make(schema, condition, JniLoader.getDefaultConfiguration()); } /** @@ -81,11 +83,11 @@ public static Filter make(Schema schema, Condition condition, long configuration // Invoke the JNI layer to create the LLVM module representing the filter. 
GandivaTypes.Condition conditionBuf = condition.toProtobuf(); GandivaTypes.Schema schemaBuf = ArrowTypeHelper.arrowSchemaToProtobuf(schema); - JniWrapper gandivaBridge = JniWrapper.getInstance(); - long moduleId = gandivaBridge.buildFilter(schemaBuf.toByteArray(), + JniWrapper wrapper = JniLoader.getInstance().getWrapper(); + long moduleId = wrapper.buildFilter(schemaBuf.toByteArray(), conditionBuf.toByteArray(), configurationId); - logger.info("Created module for the projector with id {}", moduleId); - return new Filter(moduleId, schema); + logger.debug("Created module for the filter with id {}", moduleId); + return new Filter(wrapper, moduleId, schema); } /** @@ -144,7 +146,7 @@ private void evaluate(int numRows, List buffers, List buf bufSizes[idx++] = bufLayout.getSize(); } - int numRecords = JniWrapper.getInstance().evaluateFilter(this.moduleId, numRows, + int numRecords = wrapper.evaluateFilter(this.moduleId, numRows, bufAddrs, bufSizes, selectionVector.getType().getNumber(), selectionVector.getBuffer().memoryAddress(), selectionVector.getBuffer().capacity()); @@ -161,7 +163,7 @@ public void close() throws GandivaException { return; } - JniWrapper.getInstance().closeFilter(this.moduleId); + wrapper.closeFilter(this.moduleId); this.closed = true; } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java new file mode 100644 index 0000000000000..ccb5307049460 --- /dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + +import static java.util.UUID.randomUUID; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; + +import org.apache.arrow.gandiva.exceptions.GandivaException; + +/** + * This class handles loading of the jni library, and acts as a bridge for the native functions. 
+ */ +class JniLoader { + private static final String LIBRARY_NAME = "gandiva_jni"; + private static final String IRHELPERS_BC = "irhelpers.bc"; + + private static volatile JniLoader INSTANCE; + private static volatile long defaultConfiguration = 0L; + + private final String byteCodeFilePath; + private final JniWrapper wrapper; + + private JniLoader(String byteCodeFilePath) { + this.byteCodeFilePath = byteCodeFilePath; + this.wrapper = new JniWrapper(); + } + + static JniLoader getInstance() throws GandivaException { + if (INSTANCE == null) { + synchronized (JniLoader.class) { + if (INSTANCE == null) { + INSTANCE = setupInstance(); + } + } + } + return INSTANCE; + } + + private static JniLoader setupInstance() throws GandivaException { + try { + String tempDir = System.getProperty("java.io.tmpdir"); + loadGandivaLibraryFromJar(tempDir); + File byteCodeFile = moveFileFromJarToTemp(tempDir, IRHELPERS_BC); + return new JniLoader(byteCodeFile.getAbsolutePath()); + } catch (IOException ioException) { + throw new GandivaException("unable to create native instance", ioException); + } + } + + private static void loadGandivaLibraryFromJar(final String tmpDir) + throws IOException, GandivaException { + final String libraryToLoad = System.mapLibraryName(LIBRARY_NAME); + final File libraryFile = moveFileFromJarToTemp(tmpDir, libraryToLoad); + System.load(libraryFile.getAbsolutePath()); + } + + + private static File moveFileFromJarToTemp(final String tmpDir, String libraryToLoad) + throws IOException, GandivaException { + final File temp = setupFile(tmpDir, libraryToLoad); + try (final InputStream is = JniLoader.class.getClassLoader() + .getResourceAsStream(libraryToLoad)) { + if (is == null) { + throw new GandivaException(libraryToLoad + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + } + return temp; + } + + private static File setupFile(String tmpDir, String libraryToLoad) + throws IOException, GandivaException { + // accommodate multiple processes running with gandiva jar. + // length should be ok since uuid is only 36 characters. + final String randomizeFileName = libraryToLoad + randomUUID(); + final File temp = new File(tmpDir, randomizeFileName); + if (temp.exists() && !temp.delete()) { + throw new GandivaException("File: " + temp.getAbsolutePath() + + " already exists and cannot be removed."); + } + if (!temp.createNewFile()) { + throw new GandivaException("File: " + temp.getAbsolutePath() + + " could not be created."); + } + temp.deleteOnExit(); + return temp; + } + + /** + * Returns the byte code file path extracted from jar. + */ + public String getByteCodeFilePath() { + return byteCodeFilePath; + } + + /** + * Returns the jni wrapper. + */ + JniWrapper getWrapper() throws GandivaException { + return wrapper; + } + + /** + * Get the default configuration to invoke gandiva. + * @return default configuration + * @throws GandivaException if unable to get native builder instance. 
+ */ + static long getDefaultConfiguration() throws GandivaException { + if (defaultConfiguration == 0L) { + synchronized (ConfigurationBuilder.class) { + if (defaultConfiguration == 0L) { + String defaultByteCodeFilePath = JniLoader.getInstance().getByteCodeFilePath(); + + defaultConfiguration = new ConfigurationBuilder() + .withByteCodeFilePath(defaultByteCodeFilePath) + .buildConfigInstance(); + } + } + } + return defaultConfiguration; + } +} diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java index eea42f6976ce4..f00b0fbb9151a 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java @@ -17,100 +17,15 @@ package org.apache.arrow.gandiva.evaluator; -import static java.util.UUID.randomUUID; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; - import org.apache.arrow.gandiva.exceptions.GandivaException; /** * This class is implemented in JNI. This provides the Java interface - * to invoke functions in JNI + * to invoke functions in JNI. + * This file is used to generate the .h files required for JNI. Avoid all + * external dependencies in this file. */ -class JniWrapper { - private static final String LIBRARY_NAME = "gandiva_jni"; - private static final String IRHELPERS_BC = "irhelpers.bc"; - - private static volatile JniWrapper INSTANCE; - - private final String byteCodeFilePath; - - private JniWrapper(String byteCodeFilePath) { - this.byteCodeFilePath = byteCodeFilePath; - } - - static JniWrapper getInstance() throws GandivaException { - if (INSTANCE == null) { - synchronized (JniWrapper.class) { - if (INSTANCE == null) { - INSTANCE = setupInstance(); - } - } - } - return INSTANCE; - } - - private static JniWrapper setupInstance() throws GandivaException { - try { - String tempDir = System.getProperty("java.io.tmpdir"); - loadGandivaLibraryFromJar(tempDir); - File byteCodeFile = moveFileFromJarToTemp(tempDir, IRHELPERS_BC); - return new JniWrapper(byteCodeFile.getAbsolutePath()); - } catch (IOException ioException) { - throw new GandivaException("unable to create native instance", ioException); - } - } - - private static void loadGandivaLibraryFromJar(final String tmpDir) - throws IOException, GandivaException { - final String libraryToLoad = System.mapLibraryName(LIBRARY_NAME); - final File libraryFile = moveFileFromJarToTemp(tmpDir, libraryToLoad); - System.load(libraryFile.getAbsolutePath()); - } - - - private static File moveFileFromJarToTemp(final String tmpDir, String libraryToLoad) - throws IOException, GandivaException { - final File temp = setupFile(tmpDir, libraryToLoad); - try (final InputStream is = JniWrapper.class.getClassLoader() - .getResourceAsStream(libraryToLoad)) { - if (is == null) { - throw new GandivaException(libraryToLoad + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - } - } - return temp; - } - - private static File setupFile(String tmpDir, String libraryToLoad) - throws IOException, GandivaException { - // accommodate multiple processes running with gandiva jar. - // length should be ok since uuid is only 36 characters.
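The new comment on JniWrapper matters for the build: once the class keeps no imports beyond GandivaException, its JNI header can be generated from the class alone, for example with JDK 8's javah along the lines of `javah -classpath target/classes org.apache.arrow.gandiva.evaluator.JniWrapper` (the exact command and classpath are illustrative; the patch itself only states that the file is used for header generation).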
- final String randomizeFileName = libraryToLoad + randomUUID(); - final File temp = new File(tmpDir, randomizeFileName); - if (temp.exists() && !temp.delete()) { - throw new GandivaException("File: " + temp.getAbsolutePath() + - " already exists and cannot be removed."); - } - if (!temp.createNewFile()) { - throw new GandivaException("File: " + temp.getAbsolutePath() + - " could not be created."); - } - temp.deleteOnExit(); - return temp; - } - - /** - * Returns the byte code file path extracted from jar. - */ - public String getByteCodeFilePath() { - return byteCodeFilePath; - } +public class JniWrapper { /** * Generates the projector module to evaluate the expressions with diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index d7578936b3d83..d13195ca4d391 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -46,12 +46,14 @@ public class Projector { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Projector.class); + private JniWrapper wrapper; private final long moduleId; private final Schema schema; private final int numExprs; private boolean closed; - private Projector(long moduleId, Schema schema, int numExprs) { + private Projector(JniWrapper wrapper, long moduleId, Schema schema, int numExprs) { + this.wrapper = wrapper; this.moduleId = moduleId; this.schema = schema; this.numExprs = numExprs; @@ -71,7 +73,7 @@ private Projector(long moduleId, Schema schema, int numExprs) { */ public static Projector make(Schema schema, List exprs) throws GandivaException { - return make(schema, exprs, ConfigurationBuilder.getDefaultConfiguration()); + return make(schema, exprs, JniLoader.getDefaultConfiguration()); } /** @@ -96,11 +98,11 @@ public static Projector make(Schema schema, List exprs, long // Invoke the JNI layer to create the LLVM module representing the expressions GandivaTypes.Schema schemaBuf = ArrowTypeHelper.arrowSchemaToProtobuf(schema); - JniWrapper gandivaBridge = JniWrapper.getInstance(); - long moduleId = gandivaBridge.buildProjector(schemaBuf.toByteArray(), builder.build() - .toByteArray(), configurationId); - logger.info("Created module for the projector with id {}", moduleId); - return new Projector(moduleId, schema, exprs.size()); + JniWrapper wrapper = JniLoader.getInstance().getWrapper(); + long moduleId = wrapper.buildProjector(schemaBuf.toByteArray(), + builder.build().toByteArray(), configurationId); + logger.debug("Created module for the projector with id {}", moduleId); + return new Projector(wrapper, moduleId, schema, exprs.size()); } /** @@ -175,9 +177,7 @@ private void evaluate(int numRows, List buffers, List buf valueVector.setValueCount(numRows); } - JniWrapper.getInstance().evaluateProjector(this.moduleId, numRows, - bufAddrs, bufSizes, - outAddrs, outSizes); + wrapper.evaluateProjector(this.moduleId, numRows, bufAddrs, bufSizes, outAddrs, outSizes); } /** @@ -188,7 +188,7 @@ public void close() throws GandivaException { return; } - JniWrapper.getInstance().closeProjector(this.moduleId); + wrapper.closeProjector(this.moduleId); this.closed = true; } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java new file mode 100644 index 0000000000000..1b908b9962fb3 --- 
/dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.expression; + +import java.nio.charset.Charset; + +import org.apache.arrow.gandiva.exceptions.GandivaException; +import org.apache.arrow.gandiva.ipc.GandivaTypes; + +import com.google.protobuf.ByteString; + + +/** + * Used to represent expression tree nodes representing decimal constants. + * Used in the expression (x + 5.0) + */ +class DecimalNode implements TreeNode { + private final String value; + private final int precision; + private final int scale; + + DecimalNode(String value, int precision, int scale) { + this.value = value; + this.precision = precision; + this.scale = scale; + } + + @Override + public GandivaTypes.TreeNode toProtobuf() throws GandivaException { + GandivaTypes.DecimalNode.Builder decimalNode = GandivaTypes.DecimalNode.newBuilder(); + decimalNode.setValue(value); + decimalNode.setPrecision(precision); + decimalNode.setScale(scale); + + GandivaTypes.TreeNode.Builder builder = GandivaTypes.TreeNode.newBuilder(); + builder.setDecimalNode(decimalNode.build()); + return builder.build(); + } +} diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index f5568591c2002..a220c547e44a6 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -55,6 +55,10 @@ public static TreeNode makeBinaryLiteral(byte[] binaryConstant) { return new BinaryNode(binaryConstant); } + public static TreeNode makeDecimalLiteral(String decimalConstant, int precision, int scale) { + return new DecimalNode(decimalConstant, precision, scale); + } + /** * create a null literal. 
*/ diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java index aeb3d418a70ac..97c2883c58e5e 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java @@ -17,6 +17,8 @@ package org.apache.arrow.gandiva.evaluator; +import java.math.BigDecimal; +import java.math.BigInteger; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -27,6 +29,7 @@ import org.apache.arrow.gandiva.expression.ExpressionTree; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -229,6 +232,18 @@ ArrowBuf intBuf(int[] ints) { return buffer; } + DecimalVector decimalVector(String[] values, int precision, int scale) { + DecimalVector vector = new DecimalVector("decimal" + Math.random(), allocator, precision, scale); + vector.allocateNew(); + for (int i = 0; i < values.length; i++) { + BigDecimal decimal = new BigDecimal(values[i]); + vector.setSafe(i, decimal); + } + + vector.setValueCount(values.length); + return vector; + } + ArrowBuf longBuf(long[] longs) { ArrowBuf buffer = allocator.buffer(longs.length * 8); for (int i = 0; i < longs.length; i++) { diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java new file mode 100644 index 0000000000000..4a4fb82951c16 --- /dev/null +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.junit.Assert; +import org.junit.Test; + +public class DecimalTypeUtilTest { + + @Test + public void testOutputTypesForAdd() { + ArrowType.Decimal operand1 = getDecimal(30, 10); + ArrowType.Decimal operand2 = getDecimal(30, 10); + ArrowType.Decimal resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(31, 10).equals(resultType)); + + operand1 = getDecimal(30, 6); + operand2 = getDecimal(30, 5); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(32, 6).equals(resultType)); + + operand1 = getDecimal(30, 10); + operand2 = getDecimal(38, 10); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(38, 9).equals(resultType)); + + operand1 = getDecimal(38, 10); + operand2 = getDecimal(38, 38); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(38, 9).equals(resultType)); + + operand1 = getDecimal(38, 10); + operand2 = getDecimal(38, 2); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(38, 6).equals(resultType)); + + } + + @Test + public void testOutputTypesForMultiply() { + ArrowType.Decimal operand1 = getDecimal(30, 10); + ArrowType.Decimal operand2 = getDecimal(30, 10); + ArrowType.Decimal resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.MULTIPLY, operand1, operand2); + Assert.assertTrue(getDecimal(38, 6).equals(resultType)); + + operand1 = getDecimal(38, 10); + operand2 = getDecimal(9, 2); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.MULTIPLY, operand1, operand2); + Assert.assertTrue(getDecimal(38, 6).equals(resultType)); + + } + + @Test + public void testOutputTypesForMod() { + ArrowType.Decimal operand1 = getDecimal(30, 10); + ArrowType.Decimal operand2 = getDecimal(28 , 7); + ArrowType.Decimal resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.MOD, operand1, operand2); + Assert.assertTrue(getDecimal(30, 10).equals(resultType)); + } + + private ArrowType.Decimal getDecimal(int precision, int scale) { + return new ArrowType.Decimal(precision, scale); + } + +} diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java index cd297034df80f..6934c3f9e7d1a 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java @@ -26,10 +26,12 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import com.google.common.collect.Lists; +@Ignore public class MicroBenchmarkTest extends BaseEvaluatorTest { private double toleranceRatio = 4.0; @@ -58,7 +60,7 @@ public void testAdd3() throws Exception { 1 * MILLION, 16 * THOUSAND, 4); System.out.println("Time taken for projecting 1m records of add3 is " + timeTaken + "ms"); - 
Assert.assertTrue(timeTaken <= 10 * toleranceRatio); + Assert.assertTrue(timeTaken <= 13 * toleranceRatio); } @Test diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java new file mode 100644 index 0000000000000..a3a0b4818ac22 --- /dev/null +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.gandiva.exceptions.GandivaException; +import org.apache.arrow.gandiva.expression.ExpressionTree; +import org.apache.arrow.gandiva.expression.TreeBuilder; +import org.apache.arrow.gandiva.expression.TreeNode; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; + +import com.google.common.collect.Lists; + +public class ProjectorDecimalTest extends org.apache.arrow.gandiva.evaluator.BaseEvaluatorTest { + + @Test + public void test_add() throws GandivaException { + int precision = 38; + int scale = 8; + ArrowType.Decimal decimal = new ArrowType.Decimal(precision, scale); + Field a = Field.nullable("a", decimal); + Field b = Field.nullable("b", decimal); + List args = Lists.newArrayList(a, b); + + ArrowType.Decimal outputType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, decimal, decimal); + Field retType = Field.nullable("c", outputType); + ExpressionTree root = TreeBuilder.makeExpression("add", args, retType); + + List exprs = Lists.newArrayList(root); + + Schema schema = new Schema(args); + Projector eval = Projector.make(schema, exprs); + + int numRows = 4; + byte[] validity = new byte[]{(byte) 255}; + String[] aValues = new String[]{"1.12345678","2.12345678","3.12345678","4.12345678"}; + String[] bValues = new String[]{"2.12345678","3.12345678","4.12345678","5.12345678"}; + + DecimalVector valuesa = decimalVector(aValues, precision, scale); + DecimalVector valuesb = decimalVector(bValues, precision, scale); + ArrowRecordBatch batch = + new ArrowRecordBatch( + numRows, + Lists.newArrayList(new ArrowFieldNode(numRows, 0), new ArrowFieldNode(numRows, 0)), + 
Lists.newArrayList(valuesa.getValidityBuffer(), valuesa.getDataBuffer(), + valuesb.getValidityBuffer(), valuesb.getDataBuffer())); + + DecimalVector outVector = new DecimalVector("decimal_output", allocator, outputType.getPrecision(), + outputType.getScale()); + outVector.allocateNew(numRows); + + List output = new ArrayList(); + output.add(outVector); + eval.evaluate(batch, output); + + // should have scaled down. + BigDecimal[] expOutput = new BigDecimal[]{BigDecimal.valueOf(3.2469136), + BigDecimal.valueOf(5.2469136), + BigDecimal.valueOf(7.2469136), + BigDecimal.valueOf(9.2469136)}; + + for (int i = 0; i < 4; i++) { + assertFalse(outVector.isNull(i)); + assertTrue("index : " + i + " failed compare", expOutput[i].compareTo(outVector.getObject(i) + ) == 0); + } + + // free buffers + releaseRecordBatch(batch); + releaseValueVectors(output); + eval.close(); + } + + @Test + public void test_add_literal() throws GandivaException { + int precision = 2; + int scale = 0; + ArrowType.Decimal decimal = new ArrowType.Decimal(precision, scale); + ArrowType.Decimal literalType = new ArrowType.Decimal(2, 1); + Field a = Field.nullable("a", decimal); + + ArrowType.Decimal outputType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, decimal, literalType); + Field retType = Field.nullable("c", outputType); + TreeNode field = TreeBuilder.makeField(a); + TreeNode literal = TreeBuilder.makeDecimalLiteral("6", 2, 1); + List args = Lists.newArrayList(field, literal); + TreeNode root = TreeBuilder.makeFunction("add", args, outputType); + ExpressionTree tree = TreeBuilder.makeExpression(root, retType); + + List exprs = Lists.newArrayList(tree); + + Schema schema = new Schema(Lists.newArrayList(a)); + Projector eval = Projector.make(schema, exprs); + + int numRows = 4; + String[] aValues = new String[]{"1", "2", "3", "4"}; + + DecimalVector valuesa = decimalVector(aValues, precision, scale); + ArrowRecordBatch batch = + new ArrowRecordBatch( + numRows, + Lists.newArrayList(new ArrowFieldNode(numRows, 0)), + Lists.newArrayList(valuesa.getValidityBuffer(), valuesa.getDataBuffer())); + + DecimalVector outVector = new DecimalVector("decimal_output", allocator, outputType.getPrecision(), + outputType.getScale()); + outVector.allocateNew(numRows); + + List output = new ArrayList(); + output.add(outVector); + eval.evaluate(batch, output); + + BigDecimal[] expOutput = new BigDecimal[]{BigDecimal.valueOf(1.6), BigDecimal.valueOf(2.6), + BigDecimal.valueOf(3.6), BigDecimal.valueOf(4.6)}; + + for (int i = 0; i < 4; i++) { + assertFalse(outVector.isNull(i)); + assertTrue(expOutput[i].compareTo(outVector.getObject(i)) == 0); + } + + // free buffers + releaseRecordBatch(batch); + releaseValueVectors(output); + eval.close(); + } +} diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java index 7c56f21741db2..d333556d2d7d2 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java @@ -131,6 +131,8 @@ public void testMakeProjectorParallel() throws GandivaException, InterruptedExce executors.awaitTermination(100, java.util.concurrent.TimeUnit.SECONDS); } + // Will be fixed by https://issues.apache.org/jira/browse/ARROW-4371 + @Ignore @Test public void testMakeProjector() throws GandivaException { Field a = Field.nullable("a", int64); diff --git 
a/java/gandiva/src/test/resources/logback.xml b/java/gandiva/src/test/resources/logback.xml index 9e23b555feec3..f9e449fa67b2e 100644 --- a/java/gandiva/src/test/resources/logback.xml +++ b/java/gandiva/src/test/resources/logback.xml @@ -11,9 +11,18 @@ language governing permissions and limitations under the License. --> + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + - + diff --git a/java/memory/pom.xml b/java/memory/pom.xml index 7e2c42d997ee1..b882382e6f0b8 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT arrow-memory Arrow Memory diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index bfe97015f1218..5d37df59acfb1 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -791,7 +791,7 @@ protected short _getShort(int index) { } /** - * @see {@link #getShortLE(int)}. + * @see ArrowBuf#getShortLE(int). */ @Override protected short _getShortLE(int index) { @@ -804,7 +804,7 @@ protected int _getInt(int index) { } /** - * @see {@link #getIntLE(int)}. + * @see ArrowBuf#getIntLE(int). */ @Override protected int _getIntLE(int index) { @@ -812,7 +812,7 @@ protected int _getIntLE(int index) { } /** - * @see {@link #getUnsignedMedium(int)}. + * @see ArrowBuf#getUnsignedMedium(int). */ @Override protected int _getUnsignedMedium(int index) { @@ -820,7 +820,7 @@ protected int _getUnsignedMedium(int index) { } /** - * @see {@link #getUnsignedMediumLE(int)}. + * @see ArrowBuf#getUnsignedMediumLE(int). */ @Override protected int _getUnsignedMediumLE(int index) { @@ -833,7 +833,7 @@ protected long _getLong(int index) { } /** - * @see {@link #getLongLE(int)}. + * @see ArrowBuf#getLongLE(int). */ @Override protected long _getLongLE(int index) { @@ -851,7 +851,7 @@ protected void _setShort(int index, int value) { } /** - * @see {@link #setShortLE(int, int)}. + * @see ArrowBuf#setShortLE(int, int). */ @Override protected void _setShortLE(int index, int value) { @@ -864,7 +864,7 @@ protected void _setMedium(int index, int value) { } /** - * @see {@link #setMediumLE(int, int)}. + * @see ArrowBuf#setMediumLE(int, int). */ @Override protected void _setMediumLE(int index, int value) { @@ -877,7 +877,7 @@ protected void _setInt(int index, int value) { } /** - * @see {@link #setIntLE(int, int)}. + * @see ArrowBuf#setIntLE(int, int). */ @Override protected void _setIntLE(int index, int value) { @@ -890,7 +890,7 @@ protected void _setLong(int index, long value) { } /** - * @see {@link #setLongLE(int, long)}. + * @see ArrowBuf#setLongLE(int, long). 
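The run of ArrowBuf javadoc edits above is a doclint fix rather than a wording change: `@see` is a block tag that takes a bare program-element reference, so nesting an inline `{@link ...}` inside it, as in the removed lines, is invalid javadoc and gets flagged when doclint is enabled.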
*/ @Override public void _setLongLE(int index, long value) { diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index aaa1f506fb5c2..c10d246013290 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -22,11 +22,8 @@ import java.util.IdentityHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.arrow.memory.BaseAllocator.Verbosity; -import org.apache.arrow.memory.util.AutoCloseableLock; import org.apache.arrow.memory.util.HistoricalLog; import org.apache.arrow.util.Preconditions; @@ -73,9 +70,6 @@ public class AllocationManager { // ARROW-1627 Trying to minimize memory overhead caused by previously used IdentityHashMap // see JIRA for details private final LowCostIdentityHashMap map = new LowCostIdentityHashMap<>(); - private final ReadWriteLock lock = new ReentrantReadWriteLock(); - private final AutoCloseableLock readLock = new AutoCloseableLock(lock.readLock()); - private final AutoCloseableLock writeLock = new AutoCloseableLock(lock.writeLock()); private final long amCreationTime = System.nanoTime(); private volatile BufferLedger owningLedger; @@ -115,9 +109,8 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta "A buffer can only be associated between two allocators that share the same root."); } - try (AutoCloseableLock read = readLock.open()) { - - final BufferLedger ledger = map.get(allocator); + synchronized (this) { + BufferLedger ledger = map.get(allocator); if (ledger != null) { if (retain) { ledger.inc(); @@ -125,20 +118,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta return ledger; } - } - try (AutoCloseableLock write = writeLock.open()) { - // we have to recheck existing ledger since a second reader => writer could be competing - // with us. - - final BufferLedger existingLedger = map.get(allocator); - if (existingLedger != null) { - if (retain) { - existingLedger.inc(); - } - return existingLedger; - } - - final BufferLedger ledger = new BufferLedger(allocator); + ledger = new BufferLedger(allocator); if (retain) { ledger.inc(); } @@ -153,7 +133,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta * The way that a particular BufferLedger communicates back to the AllocationManager that it * now longer needs to hold * a reference to particular piece of memory. - * Can only be called when you already hold the writeLock. + * Can only be called when you already hold the lock. */ private void release(final BufferLedger ledger) { final BaseAllocator allocator = ledger.getAllocator(); @@ -250,7 +230,7 @@ public boolean transferBalance(final BufferLedger target) { // since two balance transfers out from the allocator manager could cause incorrect // accounting, we need to ensure // that this won't happen by synchronizing on the allocator manager instance. 
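The AllocationManager locking change is easiest to see in isolation: the read-lock/write-lock pair with a re-check is collapsed into a single synchronized block, which is simpler and sufficient for these short critical sections. A minimal sketch of the pattern (class and field names are illustrative, not Arrow's):

    import java.util.IdentityHashMap;
    import java.util.Map;

    class LedgerMap {
      private final Map<Object, Object> map = new IdentityHashMap<>();

      // Before: take the read lock, miss, release, take the write lock, then
      // re-check because a second reader may have raced us to the write lock.
      // After: one monitor covers both the lookup and the insert, so no
      // re-check dance is needed.
      Object associate(Object allocator) {
        synchronized (this) {
          Object ledger = map.get(allocator);
          if (ledger == null) {
            ledger = new Object();
            map.put(allocator, ledger);
          }
          return ledger;
        }
      }
    }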
- try (AutoCloseableLock write = writeLock.open()) { + synchronized (AllocationManager.this) { if (owningLedger != this) { return true; } @@ -330,7 +310,7 @@ public int decrement(int decrement) { allocator.assertOpen(); final int outcome; - try (AutoCloseableLock write = writeLock.open()) { + synchronized (AllocationManager.this) { outcome = bufRefCnt.addAndGet(-decrement); if (outcome == 0) { lDestructionTime = System.nanoTime(); @@ -431,7 +411,7 @@ public int getSize() { * @return Amount of accounted(owned) memory associated with this ledger. */ public int getAccountedSize() { - try (AutoCloseableLock read = readLock.open()) { + synchronized (AllocationManager.this) { if (owningLedger == this) { return size; } else { diff --git a/java/memory/src/test/resources/logback.xml b/java/memory/src/test/resources/logback.xml new file mode 100644 index 0000000000000..4c54d18a210ff --- /dev/null +++ b/java/memory/src/test/resources/logback.xml @@ -0,0 +1,28 @@ + + + + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + diff --git a/java/plasma/pom.xml b/java/plasma/pom.xml index d50171a309155..8c51fa2554557 100644 --- a/java/plasma/pom.xml +++ b/java/plasma/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT arrow-plasma Arrow Plasma Client diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java b/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java index 3b67bc08ecfdc..f933c85b8365f 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java @@ -19,6 +19,9 @@ import java.util.List; +import org.apache.arrow.plasma.exceptions.DuplicateObjectException; +import org.apache.arrow.plasma.exceptions.PlasmaOutOfMemoryException; + /** * Object store interface, which provides the capabilities to put and get raw byte array, and serves. */ @@ -42,7 +45,8 @@ class ObjectStoreData { * @param value The value to put in the object store. * @param metadata encodes whatever metadata the user wishes to encode. */ - void put(byte[] objectId, byte[] value, byte[] metadata); + void put(byte[] objectId, byte[] value, byte[] metadata) + throws DuplicateObjectException, PlasmaOutOfMemoryException; /** * Get a buffer from the PlasmaStore based on the objectId. @@ -79,16 +83,6 @@ default byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { */ List get(byte[][] objectIds, int timeoutMs); - /** - * Wait until numReturns objects in objectIds are ready. - * - * @param objectIds List of object IDs to wait for. - * @param timeoutMs Return to the caller after timeoutMs milliseconds. - * @param numReturns We are waiting for this number of objects to be ready. - * @return List of object IDs that are ready - */ - List wait(byte[][] objectIds, int timeoutMs, int numReturns); - /** * Compute the hash of an object in the object store. * @@ -98,23 +92,6 @@ default byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { */ byte[] hash(byte[] objectId); - /** - * Fetch the object with the given ID from other plasma manager instances. - * - * @param objectId The object ID used to identify the object. - */ - default void fetch(byte[] objectId) { - byte[][] objectIds = {objectId}; - fetch(objectIds); - } - - /** - * Fetch the objects with the given IDs from other plasma manager instances. - * - * @param objectIds List of object IDs used to identify the objects. 
- */ - void fetch(byte[][] objectIds); - /** * Evict some objects to recover given count of bytes. * diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java index db1f35e1641bb..a708f41853d75 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java @@ -19,9 +19,10 @@ import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; +import org.apache.arrow.plasma.exceptions.DuplicateObjectException; +import org.apache.arrow.plasma.exceptions.PlasmaOutOfMemoryException; /** * The PlasmaClient is used to interface with a plasma store and manager. @@ -45,18 +46,9 @@ public PlasmaClient(String storeSocketName, String managerSocketName, int releas // interface methods -------------------- @Override - public void put(byte[] objectId, byte[] value, byte[] metadata) { - ByteBuffer buf = null; - try { - buf = PlasmaClientJNI.create(conn, objectId, value.length, metadata); - } catch (Exception e) { - System.err.println("ObjectId " + objectId + " error at PlasmaClient put"); - e.printStackTrace(); - } - if (buf == null) { - return; - } - + public void put(byte[] objectId, byte[] value, byte[] metadata) + throws DuplicateObjectException, PlasmaOutOfMemoryException { + ByteBuffer buf = PlasmaClientJNI.create(conn, objectId, value.length, metadata); buf.put(value); PlasmaClientJNI.seal(conn, objectId); PlasmaClientJNI.release(conn, objectId); @@ -81,34 +73,11 @@ public List get(byte[][] objectIds, int timeoutMs, boolean isMetadata) { return ret; } - @Override - public List wait(byte[][] objectIds, int timeoutMs, int numReturns) { - byte[][] readys = PlasmaClientJNI.wait(conn, objectIds, timeoutMs, numReturns); - - List ret = new ArrayList<>(); - for (byte[] ready : readys) { - for (byte[] id : objectIds) { - if (Arrays.equals(ready, id)) { - ret.add(id); - break; - } - } - } - - assert (ret.size() == readys.length); - return ret; - } - @Override public byte[] hash(byte[] objectId) { return PlasmaClientJNI.hash(conn, objectId); } - @Override - public void fetch(byte[][] objectIds) { - PlasmaClientJNI.fetch(conn, objectIds); - } - @Override public List get(byte[][] objectIds, int timeoutMs) { ByteBuffer[][] bufs = PlasmaClientJNI.get(conn, objectIds, timeoutMs); diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClientJNI.java b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClientJNI.java index 4f7598eae2283..7f8cf8287e510 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClientJNI.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClientJNI.java @@ -19,6 +19,9 @@ import java.nio.ByteBuffer; +import org.apache.arrow.plasma.exceptions.DuplicateObjectException; +import org.apache.arrow.plasma.exceptions.PlasmaOutOfMemoryException; + /** * JNI static methods for PlasmaClient. 
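With put() now declaring DuplicateObjectException and PlasmaOutOfMemoryException, callers can react to each failure mode instead of the old behavior of swallowing errors. A sketch, assuming the plasma JNI library has already been loaded and a store is serving on /tmp/store (socket name, object ID, and recovery policy are illustrative):

    import org.apache.arrow.plasma.PlasmaClient;
    import org.apache.arrow.plasma.exceptions.DuplicateObjectException;
    import org.apache.arrow.plasma.exceptions.PlasmaOutOfMemoryException;

    public class PlasmaPutExample {
      public static void main(String[] args) {
        // Constructor signature (store socket, manager socket, release delay) as in this patch.
        PlasmaClient client = new PlasmaClient("/tmp/store", "", 0);
        byte[] objectId = new byte[20]; // plasma object IDs are 20 bytes
        byte[] value = new byte[]{1, 2, 3};
        try {
          client.put(objectId, value, null);
        } catch (DuplicateObjectException e) {
          // The object was already created and sealed; treat the put as idempotent.
        } catch (PlasmaOutOfMemoryException e) {
          // The store is full; evict objects or surface the failure to the caller.
        }
      }
    }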
*/ @@ -28,7 +31,8 @@ public class PlasmaClientJNI { public static native void disconnect(long conn); - public static native ByteBuffer create(long conn, byte[] objectId, int size, byte[] metadata); + public static native ByteBuffer create(long conn, byte[] objectId, int size, byte[] metadata) + throws DuplicateObjectException, PlasmaOutOfMemoryException; public static native byte[] hash(long conn, byte[] objectId); diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/exceptions/DuplicateObjectException.java b/java/plasma/src/main/java/org/apache/arrow/plasma/exceptions/DuplicateObjectException.java new file mode 100644 index 0000000000000..464d54d6d5b18 --- /dev/null +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/exceptions/DuplicateObjectException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.plasma.exceptions; + +public class DuplicateObjectException extends RuntimeException { + + public DuplicateObjectException(String objectId) { + super("An object with ID " + objectId + " already exists in the plasma store."); + } + + public DuplicateObjectException(String objectId, Throwable t) { + super("An object with ID " + objectId + " already exists in the plasma store.", t); + } +} diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/exceptions/PlasmaOutOfMemoryException.java b/java/plasma/src/main/java/org/apache/arrow/plasma/exceptions/PlasmaOutOfMemoryException.java new file mode 100644 index 0000000000000..831a4caf62807 --- /dev/null +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/exceptions/PlasmaOutOfMemoryException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.plasma.exceptions; + +public class PlasmaOutOfMemoryException extends RuntimeException { + + public PlasmaOutOfMemoryException() { + super("The plasma store ran out of memory."); + } + + public PlasmaOutOfMemoryException(Throwable t) { + super("The plasma store ran out of memory.", t); + } +} diff --git a/java/plasma/src/test/java/org/apache/arrow/plasma/PlasmaClientTest.java b/java/plasma/src/test/java/org/apache/arrow/plasma/PlasmaClientTest.java index 70e277a61e478..3f326d30d834a 100644 --- a/java/plasma/src/test/java/org/apache/arrow/plasma/PlasmaClientTest.java +++ b/java/plasma/src/test/java/org/apache/arrow/plasma/PlasmaClientTest.java @@ -23,6 +23,9 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import org.apache.arrow.plasma.exceptions.DuplicateObjectException; +import org.junit.Assert; + public class PlasmaClientTest { private String storeSuffix = "/tmp/store"; @@ -142,8 +145,12 @@ public void doTest() { assert Arrays.equals(values.get(0), value1); assert Arrays.equals(values.get(1), value2); System.out.println("Plasma java client get multi-object test success."); - pLink.put(id1, value1, null); - System.out.println("Plasma java client put same object twice exception test success."); + try { + pLink.put(id1, value1, null); + Assert.fail("Failed to throw DuplicateObjectException when putting an object into the plasma store twice."); + } catch (DuplicateObjectException e) { + System.out.println("Plasma java client put same object twice exception test success."); + } byte[] id1Hash = pLink.hash(id1); assert id1Hash != null; System.out.println("Plasma java client hash test success."); diff --git a/java/plasma/src/test/resources/logback.xml b/java/plasma/src/test/resources/logback.xml new file mode 100644 index 0000000000000..4c54d18a210ff --- /dev/null +++ b/java/plasma/src/test/resources/logback.xml @@ -0,0 +1,28 @@ + + + + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + diff --git a/java/pom.xml b/java/pom.xml index 0df1178c1d62e..9093bfa46d7db 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -20,7 +20,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT pom Apache Arrow Java Root POM @@ -261,7 +261,7 @@ dd.MM.yyyy '@' HH:mm:ss z - true + false false true false @@ -334,6 +334,10 @@ true true + + + javax.annotation:javax.annotation-api:* + @@ -350,7 +354,7 @@ org.apache.rat apache-rat-plugin - 0.11 + 0.13 org.apache.maven.plugins @@ -530,6 +534,11 @@ slf4j-api ${dep.slf4j.version} + + javax.annotation + javax.annotation-api + 1.3.2 + diff --git a/java/tools/pom.xml b/java/tools/pom.xml index d01c95120a56a..c22f3beb3fa01 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT arrow-tools Arrow Tools diff --git a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java index 95ff71067a51d..6f68a9ef5e295 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java @@ -37,9 +37,9 @@ public class EchoServer { private boolean closed = false; public EchoServer(int port) throws IOException { - LOGGER.info("Starting echo server."); + LOGGER.debug("Starting echo server."); serverSocket = new ServerSocket(port); - LOGGER.info("Running echo server on port: " + port()); + LOGGER.debug("Running echo server on port: " + port()); } public static
void main(String[] args) throws Exception { @@ -59,9 +59,9 @@ public int port() { public void run() throws IOException { try { while (!closed) { - LOGGER.info("Waiting to accept new client connection."); + LOGGER.debug("Waiting to accept new client connection."); Socket clientSocket = serverSocket.accept(); - LOGGER.info("Accepted new client connection."); + LOGGER.debug("Accepted new client connection."); try (ClientConnection client = new ClientConnection(clientSocket)) { try { client.run(); @@ -69,7 +69,7 @@ public void run() throws IOException { LOGGER.warn("Error handling client connection.", e); } } - LOGGER.info("Closed connection with client"); + LOGGER.debug("Closed connection with client"); } } catch (java.net.SocketException ex) { if (!closed) { @@ -77,7 +77,7 @@ public void run() throws IOException { } } finally { serverSocket.close(); - LOGGER.info("Server closed."); + LOGGER.debug("Server closed."); } } @@ -116,7 +116,7 @@ public void run() throws IOException { } writer.end(); Preconditions.checkState(reader.bytesRead() == writer.bytesWritten()); - LOGGER.info(String.format("Echoed %d records", echoed)); + LOGGER.debug(String.format("Echoed %d records", echoed)); } } } diff --git a/java/tools/src/test/resources/logback.xml b/java/tools/src/test/resources/logback.xml new file mode 100644 index 0000000000000..ff848da2a8be1 --- /dev/null +++ b/java/tools/src/test/resources/logback.xml @@ -0,0 +1,27 @@ + + + + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + diff --git a/java/vector/pom.xml b/java/vector/pom.xml index fc4eb8fd16746..3f3275242ec8a 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.12.0-SNAPSHOT + 0.13.0-SNAPSHOT arrow-vector Arrow Vectors diff --git a/java/vector/src/main/java/org/apache/arrow/util/AutoCloseables.java b/java/vector/src/main/java/org/apache/arrow/util/AutoCloseables.java index d89478ecc709c..2f6ee9fb52ea6 100644 --- a/java/vector/src/main/java/org/apache/arrow/util/AutoCloseables.java +++ b/java/vector/src/main/java/org/apache/arrow/util/AutoCloseables.java @@ -172,10 +172,10 @@ public static RollbackCloseable rollbackable(AutoCloseable... closeables) { } /** - * close() an {@see java.lang.AutoCloseable} without throwing a (checked) - * {@see java.lang.Exception}. This wraps the close() call with a + * close() an {@link java.lang.AutoCloseable} without throwing a (checked) + * {@link java.lang.Exception}. This wraps the close() call with a * try-catch that will rethrow an Exception wrapped with a - * {@see java.lang.RuntimeException}, providing a way to call close() + * {@link java.lang.RuntimeException}, providing a way to call close() * without having to do the try-catch everywhere or propagate the Exception. 
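/*
 * [Editorial aside, not part of the patch] The pattern the corrected {@link}
 * tags above describe, shown standalone. This is a sketch of the idea only,
 * under the assumption of the javadoc's stated contract (argument may be
 * null); it is not a copy of the AutoCloseables implementation.
 */
static void closeWithoutCheckedException(AutoCloseable autoCloseable) {
  if (autoCloseable != null) {
    try {
      autoCloseable.close();
    } catch (Exception e) {
      // wrap the checked Exception so call sites need no try-catch of their own
      throw new RuntimeException("Error while closing " + autoCloseable, e);
    }
  }
}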
* * @param autoCloseable the AutoCloseable to close; may be null diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index bc0b77a0aeb0a..f3c2837cfa7e8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -22,7 +22,6 @@ import java.util.Collections; import java.util.List; -import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -43,8 +42,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector implements FixedWidthVector, FieldVector, VectorDefinitionSetter { private final int typeWidth; - protected int valueAllocationSizeInBytes; - protected int validityAllocationSizeInBytes; + protected int initialValueAllocation; protected final Field field; private int allocationMonitor; @@ -61,14 +59,7 @@ public BaseFixedWidthVector(final String name, final BufferAllocator allocator, allocationMonitor = 0; validityBuffer = allocator.getEmpty(); valueBuffer = allocator.getEmpty(); - if (typeWidth > 0) { - valueAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * typeWidth; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); - } else { - /* specialized handling for BitVector */ - valueAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); - validityAllocationSizeInBytes = valueAllocationSizeInBytes; - } + initialValueAllocation = INITIAL_VALUE_ALLOCATION; } @@ -159,12 +150,8 @@ public ArrowBuf getOffsetBuffer() { */ @Override public void setInitialCapacity(int valueCount) { - final long size = (long) valueCount * typeWidth; - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } - valueAllocationSizeInBytes = (int) size; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); + computeAndCheckBufferSize(valueCount); + initialValueAllocation = valueCount; } /** @@ -267,18 +254,13 @@ public void allocateNew() { */ @Override public boolean allocateNewSafe() { - long curAllocationSizeValue = valueAllocationSizeInBytes; - long curAllocationSizeValidity = validityAllocationSizeInBytes; - - if (curAllocationSizeValue > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory exceeds limit"); - } + computeAndCheckBufferSize(initialValueAllocation); /* we are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(curAllocationSizeValue, curAllocationSizeValidity); + allocateBytes(initialValueAllocation); } catch (Exception e) { clear(); return false; @@ -295,28 +277,32 @@ public boolean allocateNewSafe() { * @throws org.apache.arrow.memory.OutOfMemoryException on error */ public void allocateNew(int valueCount) { - long valueBufferSize = valueCount * typeWidth; - long validityBufferSize = getValidityBufferSizeFromCount(valueCount); - if (typeWidth == 0) { - /* specialized handling for BitVector */ - valueBufferSize = validityBufferSize; - } - - if (valueBufferSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } + computeAndCheckBufferSize(valueCount); /* we are doing a new allocation -- release the current 
buffers */ clear(); try { - allocateBytes(valueBufferSize, validityBufferSize); + allocateBytes(valueCount); } catch (Exception e) { clear(); throw e; } } + /* + * Compute the buffer size required for 'valueCount', and check if it's within bounds. + */ + private long computeAndCheckBufferSize(int valueCount) { + final long size = computeCombinedBufferSize(valueCount, typeWidth); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; + } + /** * Actual memory allocation is done by this function. All the calculations * and knowledge about what size to allocate is upto the callers of this @@ -326,14 +312,10 @@ public void allocateNew(int valueCount) { * within the bounds of max allocation allowed and any other error * conditions. */ - private void allocateBytes(final long valueBufferSize, final long validityBufferSize) { - /* allocate data buffer */ - int curSize = (int) valueBufferSize; - valueBuffer = allocator.buffer(curSize); - valueBuffer.readerIndex(0); - valueAllocationSizeInBytes = curSize; - /* allocate validity buffer */ - allocateValidityBuffer((int) validityBufferSize); + private void allocateBytes(int valueCount) { + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount, typeWidth); + valueBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); zeroVector(); } @@ -346,7 +328,6 @@ private void allocateBytes(final long valueBufferSize, final long validityBuffer private void allocateValidityBuffer(final int validityBufferSize) { validityBuffer = allocator.buffer(validityBufferSize); validityBuffer.readerIndex(0); - validityAllocationSizeInBytes = validityBufferSize; } /** @@ -422,43 +403,28 @@ public ArrowBuf[] getBuffers(boolean clear) { */ @Override public void reAlloc() { - valueBuffer = reallocBufferHelper(valueBuffer, true); - validityBuffer = reallocBufferHelper(validityBuffer, false); - } - - /** - * Helper method for reallocating a particular internal buffer - * Returns the new buffer. - */ - private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean dataBuffer) { - final int currentBufferCapacity = buffer.capacity(); - long baseSize = (dataBuffer ? 
valueAllocationSizeInBytes - : validityAllocationSizeInBytes); - - if (baseSize < (long) currentBufferCapacity) { - baseSize = (long) currentBufferCapacity; - } - - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); - assert newAllocationSize >= 1; - - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); + int targetValueCount = getValueCapacity() * 2; + if (targetValueCount == 0) { + if (initialValueAllocation > 0) { + targetValueCount = initialValueAllocation * 2; + } else { + targetValueCount = INITIAL_VALUE_ALLOCATION * 2; + } } + computeAndCheckBufferSize(targetValueCount); - final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); - newBuf.setBytes(0, buffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); - buffer.release(1); - buffer = newBuf; - if (dataBuffer) { - valueAllocationSizeInBytes = (int) newAllocationSize; - } else { - validityAllocationSizeInBytes = (int) newAllocationSize; - } + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetValueCount, typeWidth); + final ArrowBuf newValueBuffer = buffers.getDataBuf(); + newValueBuffer.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); + newValueBuffer.setZero(valueBuffer.capacity(), newValueBuffer.capacity() - valueBuffer.capacity()); + valueBuffer.release(); + valueBuffer = newValueBuffer; - return buffer; + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.release(); + validityBuffer = newValidityBuffer; } @Override @@ -511,9 +477,6 @@ public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers valueBuffer = dataBuffer.retain(allocator); valueCount = fieldNode.getLength(); - - valueAllocationSizeInBytes = valueBuffer.capacity(); - validityAllocationSizeInBytes = validityBuffer.capacity(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 4cbf4be19dfeb..4e014bbd2aefe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -20,6 +20,7 @@ import java.util.Collections; import java.util.Iterator; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.util.TransferPair; @@ -33,7 +34,14 @@ public abstract class BaseValueVector implements ValueVector { public static final String MAX_ALLOCATION_SIZE_PROPERTY = "arrow.vector.max_allocation_bytes"; public static final int MAX_ALLOCATION_SIZE = Integer.getInteger(MAX_ALLOCATION_SIZE_PROPERTY, Integer.MAX_VALUE); - public static final int INITIAL_VALUE_ALLOCATION = 4096; + /* + * For all fixed width vectors, the value and validity buffers are sliced from a single buffer. + * Similarly, for variable width vectors, the offsets and validity buffers are sliced from a + * single buffer. To ensure the single buffer is power-of-2 size, the initial value allocation + * should be less than power-of-2. 
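/*
 * [Editorial aside, not part of the patch] The arithmetic behind the constant
 * described in the comment above and continued below, worked for a 4-byte
 * IntVector using the same rounding rules as roundUp8() and
 * getValidityBufferSizeFromCount():
 */
long valueCount = 3970;
long dataBytes = ((valueCount * 4 + 7) / 8) * 8;           // 3970 * 4 = 15880
long validityBytes = (((valueCount + 7) / 8) + 7) / 8 * 8; // ceil(3970 / 8) = 497, rounded up to 504
// 15880 + 504 == 16384 == 2^14: already a power of 2, so rounding the
// combined buffer up to a power of 2 wastes nothing for the default size.
assert dataBytes + validityBytes == 16384;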
For IntVectors, this comes to 3970*4 (15880) for the data + buffer and 504 bytes for the validity buffer, totalling to 16384 (2^14). + */ + public static final int INITIAL_VALUE_ALLOCATION = 3970; protected final BufferAllocator allocator; protected final String name; @@ -98,5 +106,94 @@ protected ArrowBuf releaseBuffer(ArrowBuf buffer) { protected static int getValidityBufferSizeFromCount(final int valueCount) { return (int) Math.ceil(valueCount / 8.0); } + + /* round up to the next multiple of 8 */ + private static long roundUp8(long size) { + return ((size + 7) / 8) * 8; + } + + protected long computeCombinedBufferSize(int valueCount, int typeWidth) { + Preconditions.checkArgument(valueCount >= 0, "valueCount must be >= 0"); + Preconditions.checkArgument(typeWidth >= 0, "typeWidth must be >= 0"); + + // compute size of validity buffer. + long bufferSize = roundUp8(getValidityBufferSizeFromCount(valueCount)); + + // add the size of the value buffer. + if (typeWidth == 0) { + // for boolean type, value-buffer and validity-buffer are of same size. + bufferSize *= 2; + } else { + bufferSize += roundUp8(valueCount * typeWidth); + } + return BaseAllocator.nextPowerOfTwo(bufferSize); + } + + class DataAndValidityBuffers { + private ArrowBuf dataBuf; + private ArrowBuf validityBuf; + + DataAndValidityBuffers(ArrowBuf dataBuf, ArrowBuf validityBuf) { + this.dataBuf = dataBuf; + this.validityBuf = validityBuf; + } + + public ArrowBuf getDataBuf() { + return dataBuf; + } + + public ArrowBuf getValidityBuf() { + return validityBuf; + } + + } + + protected DataAndValidityBuffers allocFixedDataAndValidityBufs(int valueCount, int typeWidth) { + long bufferSize = computeCombinedBufferSize(valueCount, typeWidth); + assert bufferSize < MAX_ALLOCATION_SIZE; + + int validityBufferSize; + int dataBufferSize; + if (typeWidth == 0) { + validityBufferSize = dataBufferSize = (int) (bufferSize / 2); + } else { + // Due to roundup to power-of-2 allocation, the bufferSize could be greater than the + // requested size. Utilize the allocated buffer fully. + int actualCount = (int) ((bufferSize * 8.0) / (8 * typeWidth + 1)); + do { + validityBufferSize = (int) roundUp8(getValidityBufferSizeFromCount(actualCount)); + dataBufferSize = (int) roundUp8(actualCount * typeWidth); + if (validityBufferSize + dataBufferSize <= bufferSize) { + break; + } + --actualCount; + } while (true); + } + + + /* allocate combined buffer */ + ArrowBuf combinedBuffer = allocator.buffer((int) bufferSize); + + /* slice into requested lengths */ + ArrowBuf dataBuf = null; + ArrowBuf validityBuf = null; + int bufferOffset = 0; + for (int numBuffers = 0; numBuffers < 2; ++numBuffers) { + int len = (numBuffers == 0 ?
dataBufferSize : validityBufferSize); + ArrowBuf buf = combinedBuffer.slice(bufferOffset, len); + buf.retain(); + buf.readerIndex(0); + buf.writerIndex(0); + + bufferOffset += len; + if (numBuffers == 0) { + dataBuf = buf; + } else { + validityBuf = buf; + } + } + combinedBuffer.release(); + return new DataAndValidityBuffers(dataBuf, validityBuf); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 390dfe955b6ce..ac148a25c7c29 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -38,10 +38,8 @@ public abstract class BaseVariableWidthVector extends BaseValueVector implements VariableWidthVector, FieldVector, VectorDefinitionSetter { private static final int DEFAULT_RECORD_BYTE_COUNT = 8; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; - - private int valueAllocationSizeInBytes; - private int validityAllocationSizeInBytes; - private int offsetAllocationSizeInBytes; + private int initialValueAllocation; + private int initialValueAllocationSizeInBytes; /* protected members */ public static final int OFFSET_WIDTH = 4; /* 4 byte unsigned int to track offsets */ @@ -57,9 +55,9 @@ public abstract class BaseVariableWidthVector extends BaseValueVector public BaseVariableWidthVector(final String name, final BufferAllocator allocator, FieldType fieldType) { super(name, allocator); - valueAllocationSizeInBytes = INITIAL_BYTE_COUNT; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); - offsetAllocationSizeInBytes = (INITIAL_VALUE_ALLOCATION) * OFFSET_WIDTH; + initialValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + // -1 because we require one extra slot for the offset array. + initialValueAllocation = INITIAL_VALUE_ALLOCATION - 1; field = new Field(name, fieldType, null); valueCount = 0; lastSet = -1; @@ -155,15 +153,10 @@ public long getDataBufferAddress() { @Override public void setInitialCapacity(int valueCount) { final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } - valueAllocationSizeInBytes = (int) size; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); - /* to track the end offset of last data element in vector, we need - * an additional slot in offset buffer. - */ - offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + initialValueAllocationSizeInBytes = (int) size; + initialValueAllocation = valueCount; } /** @@ -175,17 +168,10 @@ public void setInitialCapacity(int valueCount) { @Override public void setInitialCapacity(int valueCount, double density) { long size = Math.max((long)(valueCount * density), 1L); - - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } - - valueAllocationSizeInBytes = (int) size; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); - /* to track the end offset of last data element in vector, we need - * an additional slot in offset buffer. 
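/*
 * [Editorial aside, not part of the patch] The core of the combined-allocation
 * scheme used by allocFixedDataAndValidityBufs() above, reduced to its shape:
 * one power-of-2 buffer is allocated, sliced into a data (or offsets) region
 * and a validity region, each slice is retained, and the parent is released so
 * the slices own the memory. bufferSize, dataBufferSize and validityBufferSize
 * are assumed to be precomputed as in the patch.
 */
ArrowBuf combined = allocator.buffer((int) bufferSize);
ArrowBuf dataBuf = combined.slice(0, dataBufferSize);
ArrowBuf validityBuf = combined.slice(dataBufferSize, validityBufferSize);
dataBuf.retain();
validityBuf.retain();
combined.release(); // the refcount is now held by the two slices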
- */ - offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + initialValueAllocationSizeInBytes = (int) size; + initialValueAllocation = valueCount; } /** @@ -376,20 +362,14 @@ public void allocateNew() { */ @Override public boolean allocateNewSafe() { - long curAllocationSizeValue = valueAllocationSizeInBytes; - long curAllocationSizeValidity = validityAllocationSizeInBytes; - long curAllocationSizeOffset = offsetAllocationSizeInBytes; - - if (curAllocationSizeValue > MAX_ALLOCATION_SIZE || - curAllocationSizeOffset > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory exceeds limit"); - } + checkDataBufferSize(initialValueAllocationSizeInBytes); + computeAndCheckOffsetsBufferSize(initialValueAllocation); /* we are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(curAllocationSizeValue, curAllocationSizeValidity, curAllocationSizeOffset); + allocateBytes(initialValueAllocationSizeInBytes, initialValueAllocation); } catch (Exception e) { clear(); return false; @@ -409,35 +389,59 @@ public boolean allocateNewSafe() { @Override public void allocateNew(int totalBytes, int valueCount) { assert totalBytes >= 0; - final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; - final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); - if (totalBytes > MAX_ALLOCATION_SIZE || - offsetBufferSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory exceeds limit"); - } + checkDataBufferSize(totalBytes); + computeAndCheckOffsetsBufferSize(valueCount); /* we are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(totalBytes, validityBufferSize, offsetBufferSize); + allocateBytes(totalBytes, valueCount); } catch (Exception e) { clear(); throw e; } } + /* Check if the data buffer size is within bounds. */ + private void checkDataBufferSize(long size) { + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + } + + /* + * Compute the buffer size required for 'valueCount' offsets and validity, and check if it's + * within bounds. + */ + private long computeAndCheckOffsetsBufferSize(int valueCount) { + /* to track the end offset of last data element in vector, we need + * an additional slot in offset buffer.
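/*
 * [Editorial aside, not part of the patch] Why the comment above sizes the
 * offsets buffer for valueCount + 1 entries: offsets delimit each
 * variable-width value on both sides, so element i spans
 * [offsets[i], offsets[i + 1]) in the data buffer. Hypothetical numbers:
 */
int[] offsets = {0, 3, 3, 7};         // 3 values need 4 offset slots
int start = offsets[2];               // element 2 starts at byte 3
int length = offsets[3] - offsets[2]; // and is 4 bytes long; element 1 is empty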
+ */ + final long size = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; + } + /* allocate the inner buffers */ - private void allocateBytes(final long valueBufferSize, final long validityBufferSize, - final long offsetBufferSize) { + private void allocateBytes(final int valueBufferSize, final int valueCount) { /* allocate data buffer */ - int curSize = (int) valueBufferSize; + int curSize = valueBufferSize; valueBuffer = allocator.buffer(curSize); valueBuffer.readerIndex(0); - valueAllocationSizeInBytes = curSize; - allocateValidityBuffer(validityBufferSize); - allocateOffsetBuffer(offsetBufferSize); + + /* allocate offset buffer and validity buffer */ + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH); + offsetBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); + initOffsetBuffer(); + initValidityBuffer(); } /* allocate offset buffer */ @@ -445,7 +449,6 @@ private void allocateOffsetBuffer(final long size) { final int curSize = (int) size; offsetBuffer = allocator.buffer(curSize); offsetBuffer.readerIndex(0); - offsetAllocationSizeInBytes = curSize; initOffsetBuffer(); } @@ -454,7 +457,6 @@ private void allocateValidityBuffer(final long size) { final int curSize = (int) size; validityBuffer = allocator.buffer(curSize); validityBuffer.readerIndex(0); - validityAllocationSizeInBytes = curSize; initValidityBuffer(); } @@ -476,7 +478,7 @@ public void reAlloc() { * @throws OutOfMemoryException if the internal memory allocation fails */ public void reallocDataBuffer() { - long baseSize = valueAllocationSizeInBytes; + long baseSize = initialValueAllocationSizeInBytes; final int currentBufferCapacity = valueBuffer.capacity(); if (baseSize < (long) currentBufferCapacity) { @@ -487,15 +489,12 @@ public void reallocDataBuffer() { newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); assert newAllocationSize >= 1; - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); - } + checkDataBufferSize(newAllocationSize); final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity); valueBuffer.release(); valueBuffer = newBuf; - valueAllocationSizeInBytes = (int) newAllocationSize; } /** @@ -522,40 +521,28 @@ public void reallocDataBuffer() { * @throws OutOfMemoryException if the internal memory allocation fails */ public void reallocValidityAndOffsetBuffers() { - offsetBuffer = reallocBufferHelper(offsetBuffer, true); - validityBuffer = reallocBufferHelper(validityBuffer, false); - } - - /* helper method to realloc a particular buffer. returns the allocated buffer */ - private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean offsetBuffer) { - final int currentBufferCapacity = buffer.capacity(); - long baseSize = (offsetBuffer ? 
offsetAllocationSizeInBytes - : validityAllocationSizeInBytes); - - if (baseSize < (long) currentBufferCapacity) { - baseSize = (long) currentBufferCapacity; - } - - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); - assert newAllocationSize >= 1; - - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); + int targetOffsetCount = (offsetBuffer.capacity() / OFFSET_WIDTH) * 2; + if (targetOffsetCount == 0) { + if (initialValueAllocation > 0) { + targetOffsetCount = 2 * (initialValueAllocation + 1); + } else { + targetOffsetCount = 2 * (INITIAL_VALUE_ALLOCATION + 1); + } } + computeAndCheckOffsetsBufferSize(targetOffsetCount); - final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); - newBuf.setBytes(0, buffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); - buffer.release(1); - buffer = newBuf; - if (offsetBuffer) { - offsetAllocationSizeInBytes = (int) newAllocationSize; - } else { - validityAllocationSizeInBytes = (int) newAllocationSize; - } + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetOffsetCount, OFFSET_WIDTH); + final ArrowBuf newOffsetBuffer = buffers.getDataBuf(); + newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity()); + newOffsetBuffer.setZero(offsetBuffer.capacity(), newOffsetBuffer.capacity() - offsetBuffer.capacity()); + offsetBuffer.release(); + offsetBuffer = newOffsetBuffer; - return buffer; + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.release(); + validityBuffer = newValidityBuffer; } /** @@ -919,7 +906,7 @@ public long getStartEnd(int index) { @Override public void setIndexDefined(int index) { while (index >= getValidityBufferValueCapacity()) { - validityBuffer = reallocBufferHelper(validityBuffer, false); + reallocValidityAndOffsetBuffers(); } BitVectorHelper.setValidityBitToOne(validityBuffer, index); } @@ -1072,7 +1059,7 @@ public void setSafe(int index, ByteBuffer value, int start, int length) { */ public void setNull(int index) { while (index >= getValidityBufferValueCapacity()) { - validityBuffer = reallocBufferHelper(validityBuffer, false); + reallocValidityAndOffsetBuffers(); } BitVectorHelper.setValidityBit(validityBuffer, index, 0); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 7aac28cbf1fc4..c6c964233419d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -91,11 +91,10 @@ public MinorType getMinorType() { @Override public void setInitialCapacity(int valueCount) { final int size = getValidityBufferSizeFromCount(valueCount); - if (size > MAX_ALLOCATION_SIZE) { + if (size * 2 > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } - valueAllocationSizeInBytes = size; - validityAllocationSizeInBytes = size; + initialValueAllocation = valueCount; } /** diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java 
index 48bc8936d9fbe..a407166c4f6d0 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -40,19 +40,18 @@ public void testTransferFixedWidth() { IntVector v1 = new IntVector("v1", childAllocator1); v1.allocateNew(); v1.setValueCount(4095); + long totalAllocatedMemory = childAllocator1.getAllocatedMemory(); IntVector v2 = new IntVector("v2", childAllocator2); v1.makeTransferPair(v2).transfer(); assertEquals(0, childAllocator1.getAllocatedMemory()); - int expectedBitVector = 512; - int expectedValueVector = 4096 * 4; - assertEquals(expectedBitVector + expectedValueVector, childAllocator2.getAllocatedMemory()); + assertEquals(totalAllocatedMemory, childAllocator2.getAllocatedMemory()); } @Test - public void testTransferVariableidth() { + public void testTransferVariableWidth() { BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 100000, 100000); BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 100000, 100000); @@ -63,15 +62,12 @@ public void testTransferVariableidth() { v1.setValueCount(4001); VarCharVector v2 = new VarCharVector("v2", childAllocator2); + long memoryBeforeTransfer = childAllocator1.getAllocatedMemory(); v1.makeTransferPair(v2).transfer(); assertEquals(0, childAllocator1.getAllocatedMemory()); - int expectedValueVector = 4096 * 8; - int expectedOffsetVector = 4096 * 4; - int expectedBitVector = 512; - int expected = expectedBitVector + expectedOffsetVector + expectedValueVector; - assertEquals(expected, childAllocator2.getAllocatedMemory()); + assertEquals(memoryBeforeTransfer, childAllocator2.getAllocatedMemory()); } private static class Pointer { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java b/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java index f7d3ddb397315..b10db95b6cf48 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java @@ -69,14 +69,16 @@ public void terminate() throws Exception { @Test /* NullableVarChar */ public void testCopyFromWithNulls() { - try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final VarCharVector vector2 = - newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + try (final VarCharVector vector = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); - int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(vector.getValueCapacity() >= 1); + assertEquals(0, vector.getValueCount()); + int initialCapacity = vector.getValueCapacity(); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { continue; } @@ -85,43 +87,53 @@ public void testCopyFromWithNulls() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + int capacity = vector.getValueCapacity(); + assertEquals(initialCapacity, capacity); - vector.setValueCount(4095); + vector.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 
== 0) { assertNull(vector.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector.getObject(i).toString()); } } + vector2.setInitialCapacity(initialCapacity); vector2.allocateNew(); capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(initialCapacity, capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } /* NO reAlloc() should have happened in copyFrom */ capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(initialCapacity, capacity); - vector2.setValueCount(4095); + vector2.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } } @@ -129,14 +141,16 @@ public void testCopyFromWithNulls() { @Test /* NullableVarChar */ public void testCopyFromWithNulls1() { - try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final VarCharVector vector2 = - newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + try (final VarCharVector vector = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); - int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(vector.getValueCapacity() >= 1); + assertEquals(0, vector.getValueCount()); + int initialCapacity = vector.getValueCapacity(); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { continue; } @@ -145,47 +159,57 @@ public void testCopyFromWithNulls1() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + int capacity = vector.getValueCapacity(); + assertEquals(initialCapacity, capacity); - vector.setValueCount(4095); + vector.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector.getObject(i).toString()); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024 * 10, 1024); + vector2.allocateNew((initialCapacity / 4) * 10, initialCapacity / 4); capacity = vector2.getValueCapacity(); - assertEquals(1024, capacity); + assertTrue(capacity >= initialCapacity / 4); + assertTrue(capacity < initialCapacity / 2); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; 
i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } /* 2 reAllocs should have happened in copyFromSafe() */ capacity = vector2.getValueCapacity(); - assertEquals(4096, capacity); + assertTrue(capacity >= initialCapacity); - vector2.setValueCount(4095); + vector2.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } } @@ -194,28 +218,29 @@ public void testCopyFromWithNulls1() { @Test /* IntVector */ public void testCopyFromWithNulls2() { try (final IntVector vector1 = new IntVector(EMPTY_SCHEMA_PATH, allocator); - final IntVector vector2 = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + final IntVector vector2 = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, 1000 + i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -226,23 +251,24 @@ public void testCopyFromWithNulls2() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { assertEquals("unexpected value at index: " + i, 1000 + i, 
vector2.get(i)); @@ -254,60 +280,60 @@ public void testCopyFromWithNulls2() { @Test /* BigIntVector */ public void testCopyFromWithNulls3() { try (final BigIntVector vector1 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator); - final BigIntVector vector2 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator)) { + final BigIntVector vector2 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, 10000000000L + (long)i); + vector1.setSafe(i, 10000000000L + (long) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 10000000000L + (long)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, 10000000000L + (long) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 10000000000L + (long)i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, 10000000000L + (long) i, vector2.get(i)); } } } @@ -316,8 +342,9 @@ public void testCopyFromWithNulls3() { @Test /* BitVector */ public void testCopyFromWithNulls4() { try (final BitVector vector1 = new BitVector(EMPTY_SCHEMA_PATH, allocator); - final BitVector vector2 = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + final BitVector vector2 = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + vector1.setInitialCapacity(4096); vector1.allocateNew(); assertEquals(4096, vector1.getValueCapacity()); assertEquals(0, vector1.getValueCount()); @@ -394,60 +421,60 @@ public void testCopyFromWithNulls4() { @Test /* Float4Vector */ 
public void testCopyFromWithNulls5() { try (final Float4Vector vector1 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator); - final Float4Vector vector2 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { + final Float4Vector vector2 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, 100.25f + (float)i); + vector1.setSafe(i, 100.25f + (float) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 100.25f + (float)i, vector1.get(i), 0); + assertEquals("unexpected value at index: " + i, 100.25f + (float) i, vector1.get(i), 0); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 100.25f + i * 1.0f, vector2.get(i), 0); + assertEquals("unexpected value at index: " + i, 100.25f + i * 1.0f, vector2.get(i), 0); } } } @@ -456,60 +483,62 @@ public void testCopyFromWithNulls5() { @Test /* Float8Vector */ public void testCopyFromWithNulls6() { try (final Float8Vector vector1 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator); - final Float8Vector vector2 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + final Float8Vector vector2 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i 
& 1) == 0) { continue; } vector1.setSafe(i, 123456.7865 + (double) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 123456.7865 + (double) i, vector1.get(i), 0); + assertEquals( + "unexpected value at index: " + i, 123456.7865 + (double) i, vector1.get(i), 0); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 123456.7865 + (double) i, vector2.get(i), 0); + assertEquals( + "unexpected value at index: " + i, 123456.7865 + (double) i, vector2.get(i), 0); } } } @@ -518,30 +547,31 @@ public void testCopyFromWithNulls6() { @Test /* IntervalDayVector */ public void testCopyFromWithNulls7() { try (final IntervalDayVector vector1 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator); - final IntervalDayVector vector2 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator)) { + final IntervalDayVector vector2 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final int days = 10; final int milliseconds = 10000; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, days + i, milliseconds + i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { 
assertNull(vector1.getObject(i)); } else { @@ -554,23 +584,24 @@ public void testCopyFromWithNulls7() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { final Period p = vector2.getObject(i); @@ -584,15 +615,16 @@ public void testCopyFromWithNulls7() { @Test /* IntervalYearVector */ public void testCopyFromWithNulls8() { try (final IntervalYearVector vector1 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator); - final IntervalYearVector vector2 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator)) { + final IntervalYearVector vector2 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final int interval = 30; /* 2 years 6 months */ - final Period[] periods = new Period[4096]; - for (int i = 0; i < 4096; i++) { + final Period[] periods = new Period[4096]; + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } @@ -600,18 +632,19 @@ public void testCopyFromWithNulls8() { final Period p = new Period(); final int years = (interval + i) / org.apache.arrow.vector.util.DateUtility.yearsToMonths; final int months = (interval + i) % org.apache.arrow.vector.util.DateUtility.yearsToMonths; - periods[i] = p.plusYears(years).plusMonths(months);; + periods[i] = p.plusYears(years).plusMonths(months); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -624,23 +657,24 @@ public void testCopyFromWithNulls8() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); +
assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { final Period p = vector2.getObject(i); @@ -653,61 +687,61 @@ public void testCopyFromWithNulls8() { @Test /* SmallIntVector */ public void testCopyFromWithNulls9() { try (final SmallIntVector vector1 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator); - final SmallIntVector vector2 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator)) { + final SmallIntVector vector2 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final short val = 1000; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, val + (short)i); + vector1.setSafe(i, val + (short) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (short)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + (short) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and 
realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (short)i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + (short) i, vector2.get(i)); } } } @@ -716,61 +750,61 @@ public void testCopyFromWithNulls9() { @Test /* TimeMicroVector */ public void testCopyFromWithNulls10() { try (final TimeMicroVector vector1 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator); - final TimeMicroVector vector2 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + final TimeMicroVector vector2 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final long val = 100485765432L; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, val + (long)i); + vector1.setSafe(i, val + (long) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long) i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector2.get(i)); } } } @@ -779,61 +813,61 @@ public void testCopyFromWithNulls10() { @Test /* TimeMilliVector */ public void testCopyFromWithNulls11() { try (final TimeMilliVector vector1 = new 
TimeMilliVector(EMPTY_SCHEMA_PATH, allocator); - final TimeMilliVector vector2 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator)) { + final TimeMilliVector vector2 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final int val = 1000; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, val + i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + i, vector2.get(i)); } } } @@ -842,14 +876,15 @@ public void testCopyFromWithNulls11() { @Test /* TinyIntVector */ public void testCopyFromWithNulls12() { try (final TinyIntVector vector1 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator); - final TinyIntVector vector2 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator)) { + final TinyIntVector vector2 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); byte val = -128; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } @@ -857,16 +892,16 @@ public void testCopyFromWithNulls12() { val++; } - vector1.setValueCount(4096); 
+ vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); val = -128; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -878,24 +913,24 @@ public void testCopyFromWithNulls12() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ val = -128; - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { assertEquals("unexpected value at index: " + i, val, vector2.get(i)); @@ -908,32 +943,33 @@ public void testCopyFromWithNulls12() { @Test /* DecimalVector */ public void testCopyFromWithNulls13() { try (final DecimalVector vector1 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16); - final DecimalVector vector2 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16)) { + final DecimalVector vector2 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final double baseValue = 104567897654.876543654; final BigDecimal[] decimals = new BigDecimal[4096]; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - BigDecimal decimal = new BigDecimal(baseValue + (double)i); + BigDecimal decimal = new BigDecimal(baseValue + (double) i); vector1.setSafe(i, decimal); decimals[i] = decimal; } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -945,23 +981,24 @@ public void testCopyFromWithNulls13() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - 
assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { final BigDecimal decimal = vector2.getObject(i); @@ -974,61 +1011,61 @@ public void testCopyFromWithNulls13() { @Test /* TimeStampVector */ public void testCopyFromWithNulls14() { try (final TimeStampVector vector1 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator); - final TimeStampVector vector2 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + final TimeStampVector vector2 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final long val = 20145678912L; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, val + (long)i); + vector1.setSafe(i, val + (long) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, 
vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long) i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector2.get(i)); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index 4e8d8f0f39944..68102b1c32a46 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -774,13 +774,13 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(4096, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 5); /* use density as 4 */ vector.setInitialCapacity(512, 4); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(512 * 4, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4); /** * inner value capacity we pass to data vector is 512 * 0.1 => 51 @@ -793,7 +793,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512, 0.1); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(64, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 51); /** * inner value capacity we pass to data vector is 512 * 0.01 => 5 @@ -806,7 +806,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512, 0.01); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(8, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 5); /** * inner value capacity we pass to data vector is 5 * 0.1 => 0 @@ -822,7 +822,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(5, 0.1); vector.allocateNew(); assertEquals(7, vector.getValueCapacity()); - assertEquals(1, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 1); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 4772a86356b95..30fe23cae4afd 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -32,6 +32,7 @@ import java.util.Arrays; import java.util.List; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; @@ -68,8 +69,8 @@ public void init() { private static final byte[] STR5 = "EEE5".getBytes(utf8Charset); private static final byte[] STR6 = "FFFFF6".getBytes(utf8Charset); private static final int MAX_VALUE_COUNT = - Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 4; - private static final int MAX_VALUE_COUNT_8BYTE = MAX_VALUE_COUNT 
/ 2; + (int)(Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 7); + private static final int MAX_VALUE_COUNT_8BYTE = (int)(MAX_VALUE_COUNT / 2); @After public void terminate() throws Exception { @@ -108,7 +109,7 @@ public void testFixedType1() { vector.allocateNew(1024); initialCapacity = vector.getValueCapacity(); - assertEquals(1024, initialCapacity); + assertTrue(initialCapacity >= 1024); // Put and set a few values vector.setSafe(0, 100); @@ -124,7 +125,7 @@ public void testFixedType1() { assertEquals(104, vector.get(1023)); try { - vector.set(1024, 10000); + vector.set(initialCapacity, 10000); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -133,7 +134,7 @@ public void testFixedType1() { } try { - vector.get(1024); + vector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -142,10 +143,10 @@ public void testFixedType1() { } /* this should trigger a realloc() */ - vector.setSafe(1024, 10000); + vector.setSafe(initialCapacity, 10000); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); /* check vector data after realloc */ assertEquals(100, vector.get(0)); @@ -153,16 +154,17 @@ public void testFixedType1() { assertEquals(102, vector.get(100)); assertEquals(103, vector.get(1022)); assertEquals(104, vector.get(1023)); - assertEquals(10000, vector.get(1024)); + assertEquals(10000, vector.get(initialCapacity)); /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should have been zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { // TODO: test vector.get(i) is 0 after unsafe get added assertEquals("non-zero data not expected at index: " + i, true, vector.isNull(i)); } @@ -180,7 +182,7 @@ public void testFixedType2() { intVector.setInitialCapacity(MAX_VALUE_COUNT); try { - intVector.setInitialCapacity(MAX_VALUE_COUNT + 1); + intVector.setInitialCapacity(MAX_VALUE_COUNT * 2); } catch (OversizedAllocationException oe) { error = true; } finally { @@ -195,17 +197,18 @@ public void testFixedType2() { /* allocate 64 bytes (16 * 4) */ intVector.allocateNew(); /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, intVector.getValueCapacity()); + assertTrue(intVector.getValueCapacity() >= initialCapacity); + initialCapacity = intVector.getValueCapacity(); /* populate the vector */ int j = 1; - for (int i = 0; i < 16; i += 2) { + for (int i = 0; i < initialCapacity; i += 2) { intVector.set(i, j); j++; } try { - intVector.set(16, 9); + intVector.set(initialCapacity, j); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -215,13 +218,13 @@ public void testFixedType2() { /* check vector contents */ j = 1; - for (int i = 0; i < 16; i += 2) { + for (int i = 0; i < initialCapacity; i += 2) { assertEquals("unexpected value at index: " + i, j, intVector.get(i)); j++; } try { - intVector.get(16); + intVector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -230,26 +233,27 @@ public void testFixedType2() { } /* this should trigger a realloc() */ - intVector.setSafe(16, 9); + 
intVector.setSafe(initialCapacity, j); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, intVector.getValueCapacity()); + assertTrue(intVector.getValueCapacity() >= initialCapacity * 2); /* vector data should still be intact after realloc */ j = 1; - for (int i = 0; i <= 16; i += 2) { + for (int i = 0; i <= initialCapacity; i += 2) { assertEquals("unexpected value at index: " + i, j, intVector.get(i)); j++; } /* reset the vector */ + int capacityBeforeRealloc = intVector.getValueCapacity(); intVector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, intVector.getValueCapacity()); + assertEquals(capacityBeforeRealloc, intVector.getValueCapacity()); /* vector data should have been zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeRealloc; i++) { assertEquals("non-zero data not expected at index: " + i, true, intVector.isNull(i)); } } @@ -266,7 +270,7 @@ public void testFixedType3() { floatVector.setInitialCapacity(MAX_VALUE_COUNT); try { - floatVector.setInitialCapacity(MAX_VALUE_COUNT + 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 2); } catch (OversizedAllocationException oe) { error = true; } finally { @@ -281,7 +285,8 @@ public void testFixedType3() { /* allocate 64 bytes (16 * 4) */ floatVector.allocateNew(); /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); floatVector.zeroVector(); @@ -296,7 +301,7 @@ public void testFixedType3() { floatVector.set(14, 8.5f); try { - floatVector.set(16, 9.5f); + floatVector.set(initialCapacity, 9.5f); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -315,7 +320,7 @@ public void testFixedType3() { assertEquals(8.5f, floatVector.get(14), 0); try { - floatVector.get(16); + floatVector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -324,10 +329,10 @@ public void testFixedType3() { } /* this should trigger a realloc() */ - floatVector.setSafe(16, 9.5f); + floatVector.setSafe(initialCapacity, 9.5f); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); /* vector data should still be intact after realloc */ assertEquals(1.5f, floatVector.get(0), 0); @@ -338,16 +343,17 @@ public void testFixedType3() { assertEquals(6.6f, floatVector.get(10), 0); assertEquals(7.8f, floatVector.get(12), 0); assertEquals(8.5f, floatVector.get(14), 0); - assertEquals(9.5f, floatVector.get(16), 0); + assertEquals(9.5f, floatVector.get(initialCapacity), 0); /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); floatVector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); } } @@ -364,7 +370,7 @@ public void testFixedType4() { floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE); try { - 
floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE + 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE * 2); } catch (OversizedAllocationException oe) { error = true; } finally { @@ -379,7 +385,8 @@ public void testFixedType4() { /* allocate 128 bytes (16 * 8) */ floatVector.allocateNew(); /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); /* populate the vector */ floatVector.set(0, 1.55); @@ -392,7 +399,7 @@ public void testFixedType4() { floatVector.set(14, 8.56); try { - floatVector.set(16, 9.53); + floatVector.set(initialCapacity, 9.53); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -411,7 +418,7 @@ public void testFixedType4() { assertEquals(8.56, floatVector.get(14), 0); try { - floatVector.get(16); + floatVector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -420,10 +427,10 @@ public void testFixedType4() { } /* this should trigger a realloc() */ - floatVector.setSafe(16, 9.53); + floatVector.setSafe(initialCapacity, 9.53); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); /* vector data should still be intact after realloc */ assertEquals(1.55, floatVector.get(0), 0); @@ -434,16 +441,17 @@ public void testFixedType4() { assertEquals(6.67, floatVector.get(10), 0); assertEquals(7.87, floatVector.get(12), 0); assertEquals(8.56, floatVector.get(14), 0); - assertEquals(9.53, floatVector.get(16), 0); + assertEquals(9.53, floatVector.get(initialCapacity), 0); /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); floatVector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); } } @@ -463,36 +471,37 @@ public void testNullableFixedType1() { assertEquals(0, vector.getValueCapacity()); vector.allocateNew(); - assertEquals(initialCapacity, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); // Put and set a few values vector.set(0, 100); vector.set(1, 101); vector.set(100, 102); - vector.set(1022, 103); - vector.set(1023, 104); + vector.set(initialCapacity - 2, 103); + vector.set(initialCapacity - 1, 104); /* check vector contents */ assertEquals(100, vector.get(0)); assertEquals(101, vector.get(1)); assertEquals(102, vector.get(100)); - assertEquals(103, vector.get(1022)); - assertEquals(104, vector.get(1023)); + assertEquals(103, vector.get(initialCapacity - 2)); + assertEquals(104, vector.get(initialCapacity - 1)); int val = 0; /* check unset bits/null values */ - for (int i = 2, j = 101; i <= 99 || j <= 1021; i++, j++) { + for (int i = 2, j = 101; i <= 99 || j <= initialCapacity - 3; i++, j++) { if (i <= 99) { assertTrue(vector.isNull(i)); } - if (j <= 1021) { + if (j <= initialCapacity - 3) { assertTrue(vector.isNull(j)); } } try { - vector.set(1024, 10000); + vector.set(initialCapacity, 
10000); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -501,7 +510,7 @@ public void testNullableFixedType1() { } try { - vector.get(1024); + vector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -510,39 +519,40 @@ public void testNullableFixedType1() { } /* should trigger a realloc of the underlying bitvector and valuevector */ - vector.setSafe(1024, 10000); + vector.setSafe(initialCapacity, 10000); /* check new capacity */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); /* vector contents should still be intact after realloc */ assertEquals(100, vector.get(0)); assertEquals(101, vector.get(1)); assertEquals(102, vector.get(100)); - assertEquals(103, vector.get(1022)); - assertEquals(104, vector.get(1023)); - assertEquals(10000, vector.get(1024)); + assertEquals(103, vector.get(initialCapacity - 2)); + assertEquals(104, vector.get(initialCapacity - 1)); + assertEquals(10000, vector.get(initialCapacity)); val = 0; /* check unset bits/null values */ - for (int i = 2, j = 101; i < 99 || j < 1021; i++, j++) { + for (int i = 2, j = 101; i < 99 || j < initialCapacity - 3; i++, j++) { if (i <= 99) { assertTrue(vector.isNull(i)); } - if (j <= 1021) { + if (j <= initialCapacity - 3) { assertTrue(vector.isNull(j)); } } /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -560,7 +570,8 @@ public void testNullableFixedType2() { assertEquals(0, vector.getValueCapacity()); vector.allocateNew(); - assertEquals(initialCapacity, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); /* populate the vector */ vector.set(0, 100.5f); @@ -573,7 +584,7 @@ public void testNullableFixedType2() { vector.set(14, 89.5f); try { - vector.set(16, 90.5f); + vector.set(initialCapacity, 90.5f); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -600,7 +611,7 @@ public void testNullableFixedType2() { assertTrue(vector.isNull(15)); try { - vector.get(16); + vector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -609,10 +620,10 @@ public void testNullableFixedType2() { } /* this should trigger a realloc() */ - vector.setSafe(16, 90.5f); + vector.setSafe(initialCapacity, 90.5f); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); /* vector data should still be intact after realloc */ assertEquals(100.5f, vector.get(0), 0); @@ -633,13 +644,14 @@ public void testNullableFixedType2() { assertTrue(vector.isNull(15)); /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); 
i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -656,8 +668,9 @@ public void testNullableFixedType3() { assertEquals(0, vector.getValueCapacity()); /* allocate space for 4KB data (1024 * 4) */ vector.allocateNew(initialCapacity); - /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, vector.getValueCapacity()); + /* underlying buffer should be able to store 1024 values */ + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); vector.set(0, 1); vector.set(1, 2); @@ -687,7 +700,7 @@ public void testNullableFixedType3() { ArrowBuf validityVectorBuf = buffers.get(0); /* bitvector tracks 1024 integers --> 1024 bits --> 128 bytes */ - assertEquals(128, validityVectorBuf.readableBytes()); + assertTrue(validityVectorBuf.readableBytes() >= 128); assertEquals(3, validityVectorBuf.getByte(0)); // 1st and second bit defined for (int i = 1; i < 12; i++) { assertEquals(0, validityVectorBuf.getByte(i)); // nothing defined until 100 @@ -699,15 +712,15 @@ public void testNullableFixedType3() { assertEquals(-64, validityVectorBuf.getByte(127)); // 1022nd and 1023rd bit defined /* this should trigger a realloc() */ - vector.setSafe(1024, 6); + vector.setSafe(initialCapacity, 6); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); /* vector data should still be intact after realloc */ j = 1; for (int i = 0; i < (initialCapacity * 2); i++) { - if ((i > 1024) || (i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { + if ((i > 1023 && i != initialCapacity) || (i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } else { assertFalse("null data not expected at index: " + i, vector.isNull(i)); @@ -717,19 +730,20 @@ public void testNullableFixedType3() { } /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should have been zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } - vector.allocateNew(4096); + vector.allocateNew(initialCapacity * 4); // vector has been erased - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity * 4; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -764,7 +778,7 @@ public void testNullableFixedType4() { } vector.setSafe(valueCapacity, 20000000); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); for (int i = 0; i < vector.getValueCapacity(); i++) { if (i == valueCapacity) { @@ -795,14 +809,15 @@ public void testNullableFixedType4() { } } - vector.setSafe((valueCapacity * 2) + 1000, 400000000); - assertEquals(valueCapacity * 4, vector.getValueCapacity()); + int valueCapacityBeforeRealloc = vector.getValueCapacity(); + vector.setSafe(valueCapacityBeforeRealloc + 1000, 400000000); + assertTrue(vector.getValueCapacity() >= valueCapacity * 4); for (int i = 0; i < vector.getValueCapacity(); i++) { - if (i 
== (valueCapacity * 2 + 1000)) { + if (i == (valueCapacityBeforeRealloc + 1000)) { assertFalse("unexpected null value at index: " + i, vector.isNull(i)); assertEquals("unexpected value at index: " + i, 400000000, vector.get(i)); - } else if (i < valueCapacity * 2 && (i % 2) == 0) { + } else if (i < valueCapacityBeforeRealloc && (i % 2) == 0) { assertFalse("unexpected null value at index: " + i, vector.isNull(i)); assertEquals("unexpected value at index: " + i, baseValue + i, vector.get(i)); } else { @@ -811,13 +826,14 @@ public void testNullableFixedType4() { } /* reset the vector */ + int valueCapacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(valueCapacity * 4, vector.getValueCapacity()); + assertEquals(valueCapacityBeforeReset, vector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (valueCapacity * 4); i++) { + for (int i = 0; i < valueCapacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -936,52 +952,56 @@ public void testNullableVarType2() { @Test /* Float8Vector */ public void testReallocAfterVectorTransfer1() { try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final int initialDefaultCapacity = 4096; + int initialCapacity = 4096; boolean error = false; /* use the default capacity; 4096*8 => 32KB */ + vector.setInitialCapacity(initialCapacity); vector.allocateNew(); - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); double baseValue = 100.375; - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { vector.setSafe(i, baseValue + (double)i); } /* the above setSafe calls should not have triggered a realloc as * we are within the capacity. 
check the vector contents */ - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); - assertEquals(initialDefaultCapacity * 2, vector.getValueCapacity()); + vector.setSafe(initialCapacity, baseValue + (double)initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + int capacityAfterRealloc1 = vector.getValueCapacity(); - for (int i = initialDefaultCapacity + 1; i < (initialDefaultCapacity * 2); i++) { + for (int i = initialCapacity + 1; i < capacityAfterRealloc1; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 2); i++) { + for (int i = 0; i < capacityAfterRealloc1; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); - assertEquals(initialDefaultCapacity * 4, vector.getValueCapacity()); + vector.setSafe(capacityAfterRealloc1, baseValue + (double)(capacityAfterRealloc1)); + assertTrue(vector.getValueCapacity() >= initialCapacity * 4); + int capacityAfterRealloc2 = vector.getValueCapacity(); - for (int i = (initialDefaultCapacity * 2) + 1; i < (initialDefaultCapacity * 4); i++) { + for (int i = capacityAfterRealloc1 + 1; i < capacityAfterRealloc2; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 4); i++) { + for (int i = 0; i < capacityAfterRealloc2; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } @@ -997,10 +1017,10 @@ public void testReallocAfterVectorTransfer1() { /* now let's realloc the toVector */ toVector.reAlloc(); - assertEquals(initialDefaultCapacity * 8, toVector.getValueCapacity()); + assertTrue(toVector.getValueCapacity() >= initialCapacity * 8); - for (int i = 0; i < (initialDefaultCapacity * 8); i++) { - if (i < (initialDefaultCapacity * 4)) { + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i < capacityAfterRealloc2) { assertEquals(baseValue + (double)i, toVector.get(i), 0); } else { assertTrue(toVector.isNull(i)); @@ -1014,51 +1034,53 @@ public void testReallocAfterVectorTransfer1() { @Test /* Float8Vector */ public void testReallocAfterVectorTransfer2() { try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final int initialDefaultCapacity = 4096; + int initialCapacity = 4096; boolean error = false; - vector.allocateNew(initialDefaultCapacity); - - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + vector.allocateNew(initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); double baseValue = 100.375; - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { vector.setSafe(i, baseValue + (double)i); } /* the above setSafe calls should not have triggered a realloc as * we are within the capacity. 
check the vector contents */ - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); - assertEquals(initialDefaultCapacity * 2, vector.getValueCapacity()); + vector.setSafe(initialCapacity, baseValue + (double)initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + int capacityAfterRealloc1 = vector.getValueCapacity(); - for (int i = initialDefaultCapacity + 1; i < (initialDefaultCapacity * 2); i++) { + for (int i = initialCapacity + 1; i < capacityAfterRealloc1; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 2); i++) { + for (int i = 0; i < capacityAfterRealloc1; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); - assertEquals(initialDefaultCapacity * 4, vector.getValueCapacity()); + vector.setSafe(capacityAfterRealloc1, baseValue + (double)(capacityAfterRealloc1)); + assertTrue(vector.getValueCapacity() >= initialCapacity * 4); + int capacityAfterRealloc2 = vector.getValueCapacity(); - for (int i = (initialDefaultCapacity * 2) + 1; i < (initialDefaultCapacity * 4); i++) { + for (int i = capacityAfterRealloc1 + 1; i < capacityAfterRealloc2; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 4); i++) { + for (int i = 0; i < capacityAfterRealloc2; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } @@ -1073,7 +1095,7 @@ public void testReallocAfterVectorTransfer2() { Float8Vector toVector = (Float8Vector)transferPair.getTo(); /* check toVector contents before realloc */ - for (int i = 0; i < (initialDefaultCapacity * 4); i++) { + for (int i = 0; i < toVector.getValueCapacity(); i++) { assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); double value = toVector.get(i); assertEquals("unexpected value at index: " + i, baseValue + (double)i, value, 0); @@ -1081,10 +1103,10 @@ public void testReallocAfterVectorTransfer2() { /* now let's realloc the toVector and check contents again */ toVector.reAlloc(); - assertEquals(initialDefaultCapacity * 8, toVector.getValueCapacity()); + assertTrue(toVector.getValueCapacity() >= initialCapacity * 8); - for (int i = 0; i < (initialDefaultCapacity * 8); i++) { - if (i < (initialDefaultCapacity * 4)) { + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i < capacityAfterRealloc2) { assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); double value = toVector.get(i); assertEquals("unexpected value at index: " + i, baseValue + (double)i, value, 0); @@ -1103,7 +1125,7 @@ public void testReallocAfterVectorTransfer3() { /* 4096 values with 10 byte per record */ vector.allocateNew(4096 * 10, 4096); int valueCapacity = vector.getValueCapacity(); - assertEquals(4096, valueCapacity); + assertTrue(valueCapacity >= 4096); /* populate the vector */ for (int i = 0; i < valueCapacity; i++) { @@ -1125,7 +1147,10 @@ public void testReallocAfterVectorTransfer3() { /* trigger first realloc */ vector.setSafe(valueCapacity, STR2, 0, 
STR2.length); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1148,7 +1173,10 @@ public void testReallocAfterVectorTransfer3() { /* trigger second realloc */ vector.setSafe(valueCapacity + 10, STR2, 0, STR2.length); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1197,7 +1225,7 @@ public void testReallocAfterVectorTransfer4() { /* 4096 values */ vector.allocateNew(4096); int valueCapacity = vector.getValueCapacity(); - assertEquals(4096, valueCapacity); + assertTrue(valueCapacity >= 4096); /* populate the vector */ int baseValue = 1000; @@ -1218,7 +1246,7 @@ public void testReallocAfterVectorTransfer4() { /* trigger first realloc */ vector.setSafe(valueCapacity, 10000000); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1239,7 +1267,7 @@ public void testReallocAfterVectorTransfer4() { /* trigger second realloc */ vector.setSafe(valueCapacity, 10000000); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1288,7 +1316,8 @@ public void testReAllocFixedWidthVector() { try (final Float4Vector vector = newVector(Float4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator)) { vector.allocateNew(1024); - assertEquals(1024, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 1024); + int initialCapacity = vector.getValueCapacity(); // Put values in indexes that fall within the initial allocation vector.setSafe(0, 100.1f); @@ -1299,7 +1328,7 @@ public void testReAllocFixedWidthVector() { vector.setSafe(2000, 105.5f); // Check valueCapacity is more than initial allocation - assertEquals(1024 * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); assertEquals(100.1f, vector.get(0), 0); assertEquals(102.3f, vector.get(100), 0); @@ -1316,24 +1345,24 @@ public void testReAllocFixedWidthVector() { @Test public void testReAllocVariableWidthVector() { try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertTrue(initialCapacity >= 4095); /* Put values in indexes that fall within the initial allocation */ vector.setSafe(0, STR1, 0, STR1.length); vector.setSafe(initialCapacity - 1, STR2, 0, STR2.length); /* the above set calls should NOT have triggered a realloc */ - initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertEquals(initialCapacity, vector.getValueCapacity()); /* Now try to put values in space that falls beyond the initial allocation */ vector.setSafe(initialCapacity + 
200, STR3, 0, STR3.length); /* Check valueCapacity is more than initial allocation */ - assertEquals(((initialCapacity + 1) * 2) - 1, vector.getValueCapacity()); + assertTrue(initialCapacity * 2 <= vector.getValueCapacity()); assertArrayEquals(STR1, vector.get(0)); assertArrayEquals(STR2, vector.get(initialCapacity - 1)); @@ -1348,20 +1377,20 @@ public void testReAllocVariableWidthVector() { @Test public void testFillEmptiesNotOverfill() { try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertTrue(initialCapacity >= 4095); vector.setSafe(4094, "hello".getBytes(), 0, 5); /* the above set method should NOT have trigerred a realloc */ - initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertEquals(initialCapacity, vector.getValueCapacity()); - vector.setValueCount(4095); - assertEquals(4096 * vector.OFFSET_WIDTH, vector.getFieldBuffers().get(1).capacity()); - initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + int bufSizeBefore = vector.getFieldBuffers().get(1).capacity(); + vector.setValueCount(initialCapacity); + assertEquals(bufSizeBefore, vector.getFieldBuffers().get(1).capacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); } } @@ -1371,11 +1400,12 @@ public void testCopyFromWithNulls() { final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(capacity >= 4095); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { continue; } @@ -1384,12 +1414,11 @@ public void testCopyFromWithNulls() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(capacity, vector.getValueCapacity()); - vector.setValueCount(4095); + vector.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { @@ -1397,11 +1426,12 @@ public void testCopyFromWithNulls() { } } + vector2.setInitialCapacity(4095); vector2.allocateNew(); - capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + int capacity2 = vector2.getValueCapacity(); + assertEquals(capacity2, capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); @@ -1411,12 +1441,11 @@ public void testCopyFromWithNulls() { } /* NO reAlloc() should have happened in copyFrom */ - capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(capacity, vector2.getValueCapacity()); - vector2.setValueCount(4095); + vector2.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { @@ -1432,11 +1461,12 @@ public void testCopyFromWithNulls1() { final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(capacity >= 
4095); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { continue; } @@ -1445,12 +1475,11 @@ public void testCopyFromWithNulls1() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(capacity, vector.getValueCapacity()); - vector.setValueCount(4095); + vector.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { @@ -1463,10 +1492,11 @@ public void testCopyFromWithNulls1() { */ vector2.allocateNew(1024 * 10, 1024); - capacity = vector2.getValueCapacity(); - assertEquals(1024, capacity); + int capacity2 = vector2.getValueCapacity(); + assertTrue(capacity2 >= 1024); + assertTrue(capacity2 <= capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); @@ -1476,12 +1506,11 @@ public void testCopyFromWithNulls1() { } /* 2 reAllocs should have happened in copyFromSafe() */ - capacity = vector2.getValueCapacity(); - assertEquals(4096, capacity); + assertEquals(capacity, vector2.getValueCapacity()); - vector2.setValueCount(4095); + vector2.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { @@ -1876,30 +1905,88 @@ public void testSetInitialCapacity() { try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { /* use the default 8 data bytes on average per element */ - vector.setInitialCapacity(4096); + int defaultCapacity = BaseValueVector.INITIAL_VALUE_ALLOCATION - 1; + vector.setInitialCapacity(defaultCapacity); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(4096 * 8, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo(defaultCapacity * 8), vector.getDataBuffer().capacity()); - vector.setInitialCapacity(4096, 1); + vector.setInitialCapacity(defaultCapacity, 1); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(4096, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo(defaultCapacity), vector.getDataBuffer().capacity()); - vector.setInitialCapacity(4096, 0.1); + vector.setInitialCapacity(defaultCapacity, 0.1); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(512, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo((int)(defaultCapacity * 0.1)), vector.getDataBuffer().capacity()); - vector.setInitialCapacity(4096, 0.01); + vector.setInitialCapacity(defaultCapacity, 0.01); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(64, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo((int)(defaultCapacity * 0.01)), vector.getDataBuffer().capacity()); vector.setInitialCapacity(5, 0.01); vector.allocateNew(); - assertEquals(7, vector.getValueCapacity()); + assertEquals(5, vector.getValueCapacity()); assertEquals(2, vector.getDataBuffer().capacity()); } } + + @Test + public void testDefaultAllocNewAll() { + int defaultCapacity = 
BaseFixedWidthVector.INITIAL_VALUE_ALLOCATION; + int expectedSize; + long beforeSize; + try (BufferAllocator childAllocator = allocator.newChildAllocator("defaultAllocs", 0, Long.MAX_VALUE); + final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH, childAllocator); + final BigIntVector bigIntVector = new BigIntVector(EMPTY_SCHEMA_PATH, childAllocator); + final BitVector bitVector = new BitVector(EMPTY_SCHEMA_PATH, childAllocator); + final DecimalVector decimalVector = new DecimalVector(EMPTY_SCHEMA_PATH, childAllocator, 38, 6); + final VarCharVector varCharVector = new VarCharVector(EMPTY_SCHEMA_PATH, childAllocator)) { + + // verify that the wastage is within bounds for IntVector. + beforeSize = childAllocator.getAllocatedMemory(); + intVector.allocateNew(); + assertTrue(intVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * IntVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for BigIntVector. + beforeSize = childAllocator.getAllocatedMemory(); + bigIntVector.allocateNew(); + assertTrue(bigIntVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * bigIntVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for DecimalVector. + beforeSize = childAllocator.getAllocatedMemory(); + decimalVector.allocateNew(); + assertTrue(decimalVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * decimalVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for VarCharVector. + // var char vector have an offsets array that is 1 less than defaultCapacity + beforeSize = childAllocator.getAllocatedMemory(); + varCharVector.allocateNew(); + assertTrue(varCharVector.getValueCapacity() >= defaultCapacity - 1); + expectedSize = (defaultCapacity * VarCharVector.OFFSET_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity) + + defaultCapacity * 8; + // wastage should be less than 5%. + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for BitVector. 
+ beforeSize = childAllocator.getAllocatedMemory(); + bitVector.allocateNew(); + assertTrue(bitVector.getValueCapacity() >= defaultCapacity); + expectedSize = BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity) * 2; + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index 5474675fbf343..60747aaad92ce 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; import java.nio.charset.StandardCharsets; @@ -54,20 +55,21 @@ public void testFixedType() { vector.setInitialCapacity(512); vector.allocateNew(); - assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); try { - vector.set(512, 0); + vector.set(initialCapacity, 0); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok } vector.reAlloc(); - assertEquals(1024, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); - vector.set(512, 100); - assertEquals(100, vector.get(512)); + vector.set(initialCapacity, 100); + assertEquals(100, vector.get(initialCapacity)); } } @@ -77,20 +79,21 @@ public void testNullableType() { vector.setInitialCapacity(512); vector.allocateNew(); - assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); try { - vector.set(512, "foo".getBytes(StandardCharsets.UTF_8)); + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok } vector.reAlloc(); - assertEquals(1024, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); - vector.set(512, "foo".getBytes(StandardCharsets.UTF_8)); - assertEquals("foo", new String(vector.get(512), StandardCharsets.UTF_8)); + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + assertEquals("foo", new String(vector.get(initialCapacity), StandardCharsets.UTF_8)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index b7215ce4e2e68..61c1b924f664d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -974,11 +974,16 @@ public void testSingleStructWriter1() { Float4Vector float4Vector = (Float4Vector)parent.getChild("float4Field"); Float8Vector float8Vector = (Float8Vector)parent.getChild("float8Field"); - assertEquals(initialCapacity, singleStructWriter.getValueCapacity()); - assertEquals(initialCapacity, intVector.getValueCapacity()); - assertEquals(initialCapacity, bigIntVector.getValueCapacity()); - assertEquals(initialCapacity, float4Vector.getValueCapacity()); - assertEquals(initialCapacity, float8Vector.getValueCapacity()); + int capacity = singleStructWriter.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 
2); + capacity = intVector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = bigIntVector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = float4Vector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = float8Vector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); StructReader singleStructReader = new SingleStructReaderImpl(parent); diff --git a/java/vector/src/test/resources/logback.xml b/java/vector/src/test/resources/logback.xml new file mode 100644 index 0000000000000..f9e449fa67b2e --- /dev/null +++ b/java/vector/src/test/resources/logback.xml @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<configuration> + <statusListener class="ch.qos.logback.core.status.NopStatusListener"/> + <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> + </encoder> + </appender> + <logger name="org.apache.arrow" additivity="false" level="INFO"> + <appender-ref ref="STDOUT"/> + </logger> +</configuration> diff --git a/js/.gitignore b/js/.gitignore index 3437e39da6c0a..5e412f8ee8a57 100644 --- a/js/.gitignore +++ b/js/.gitignore @@ -23,7 +23,8 @@ npm-debug.log* yarn-debug.log* yarn-error.log* -.vscode +.vscode/** +!.vscode/launch.json # Runtime data pids @@ -78,10 +79,13 @@ yarn.lock .env # compilation targets +doc dist targets # test data files -test/data/ +test/data/**/*.json +test/data/**/*.arrow + # jest snapshots (too big) test/__snapshots__/ diff --git a/js/.vscode/launch.json b/js/.vscode/launch.json new file mode 100644 index 0000000000000..ba5609e0c10e8 --- /dev/null +++ b/js/.vscode/launch.json @@ -0,0 +1,169 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "node", + "request": "launch", + "name": "Debug Gulp Build", + "program": "${workspaceFolder}/node_modules/gulp/bin/gulp.js", + "args": [ + "build", + // Specify we want to debug the "src" target, which won't clean or build -- essentially a "dry-run" of the gulp build + "--target", "src" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug Unit Tests", + "cwd": "${workspaceRoot}", + "program": "${workspaceFolder}/node_modules/.bin/jest", + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "env": { + "NODE_NO_WARNINGS": "1", + "READABLE_STREAM": "disable", + "TEST_DOM_STREAMS": "true", + "TEST_NODE_STREAMS": "true", + // Modify these environment variables to run tests on a specific compilation target + module format combo + "TEST_TS_SOURCE": "true", + // "TEST_TS_SOURCE": "false", + // "TEST_TARGET": "es5", + // "TEST_MODULE": "umd" + }, + "args": [ + // "-i", + "test/unit/", + + // Uncomment any of these to run individual test suites + // "test/unit/int-tests.ts", + // "test/unit/table-tests.ts", + // "test/unit/generated-data-tests.ts", + + // "test/unit/vector/vector-tests.ts", + // "test/unit/vector/bool-vector-tests.ts", + // "test/unit/vector/date-vector-tests.ts", + // "test/unit/vector/float16-vector-tests.ts", + // "test/unit/vector/numeric-vector-tests.ts", + + // "test/unit/visitor-tests.ts", + + // "test/unit/ipc/message-reader-tests.ts", + // "test/unit/ipc/reader/file-reader-tests.ts", + // "test/unit/ipc/reader/json-reader-tests.ts", + // "test/unit/ipc/reader/from-inference-tests.ts", + // "test/unit/ipc/reader/stream-reader-tests.ts", + // "test/unit/ipc/reader/streams-dom-tests.ts", + // "test/unit/ipc/reader/streams-node-tests.ts", + // "test/unit/ipc/writer/file-writer-tests.ts",
// "test/unit/ipc/writer/json-writer-tests.ts", + // "test/unit/ipc/writer/stream-writer-tests.ts", + // "test/unit/ipc/writer/streams-dom-tests.ts", + // "test/unit/ipc/writer/streams-node-tests.ts", + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug Integration Tests", + "cwd": "${workspaceRoot}", + "program": "${workspaceFolder}/bin/integration.js", + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "env": { + "NODE_NO_WARNINGS": "1", + "READABLE_STREAM": "disable" + }, + "args": [ + "--mode", "VALIDATE" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/arrow2csv", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "console": "integratedTerminal", + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/src/bin/arrow2csv.ts", + "-f", "./test/data/cpp/stream/simple.arrow" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/file-to-stream", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/file-to-stream.js", + "./test/data/cpp/file/struct_example.arrow", + "./struct_example-stream-out.arrow", + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/stream-to-file", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/stream-to-file.js", + "./test/data/cpp/stream/struct_example.arrow", + "./struct_example-file-out.arrow", + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/json-to-arrow", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/json-to-arrow.js", + "-j", "./test/data/json/struct_example.json", + "-a", "./struct_example-stream-out.arrow", + "-f", "stream" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/print-buffer-alignment", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/print-buffer-alignment.js", + "./test/data/cpp/stream/struct_example.arrow" + ] + } + ] +} diff --git a/js/README.md b/js/README.md index 15d7ed03f65a4..0af4fecabccc9 100644 --- a/js/README.md +++ b/js/README.md @@ -49,7 +49,7 @@ Check out our [API documentation][7] to learn more about how to use Apache Arrow ### Get a table from an Arrow file on disk (in IPC format) -```es6 +```js import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; @@ -70,7 +70,7 @@ null, null, null ### Create a Table when the Arrow file is split across buffers -```es6 +```js import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; @@ -93,12 +93,24 @@ console.log(table.toString()); ### Create a Table from JavaScript arrays -```es6 +```js +import { + Table, + FloatVector, + DateVector +} from 'apache-arrow'; + const LENGTH = 2000; -const rainAmounts = Float32Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1))); 
-const rainDates = Array.from({length: LENGTH}, (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i)); -const rainfall = arrow.Table.fromVectors( +const rainAmounts = Float32Array.from( + { length: LENGTH }, + () => Number((Math.random() * 20).toFixed(1))); + +const rainDates = Array.from( + { length: LENGTH }, + (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i)); + +const rainfall = Table.fromVectors( [FloatVector.from(rainAmounts), DateVector.from(rainDates)], ['precipitation', 'date'] ); @@ -106,20 +118,17 @@ const rainfall = arrow.Table.fromVectors( ### Load data with `fetch` -```es6 +```js import { Table } from "apache-arrow"; -fetch(require("simple.arrow")).then(response => { - response.arrayBuffer().then(buffer => { - const table = Table.from(new Uint8Array(buffer)); - console.log(table.toString()); - }); -}); +const table = await Table.from(fetch("/simple.arrow")); +console.log(table.toString()); + ``` ### Columns look like JS Arrays -```es6 +```js import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; @@ -131,7 +140,7 @@ const table = Table.from([ const column = table.getColumn('origin_lat'); // Copy the data into a TypedArray -const typed = column.slice(); +const typed = column.toArray(); assert(typed instanceof Float32Array); for (let i = -1, n = column.length; ++i < n;) { @@ -141,7 +150,7 @@ for (let i = -1, n = column.length; ++i < n;) { ### Usage with MapD Core -```es6 +```js import MapD from 'rxjs-mapd'; import { Table } from 'apache-arrow'; @@ -164,7 +173,7 @@ MapD.open(host, port) ) .map(([schema, records]) => // Create Arrow Table from results - Table.from(schema, records)) + Table.from([schema, records])) .map((table) => // Stringify the table to CSV with row numbers table.toString({ index: true })) diff --git a/js/bin/arrow2csv.js b/js/bin/arrow2csv.js new file mode 100755 index 0000000000000..0e446fabe7958 --- /dev/null +++ b/js/bin/arrow2csv.js @@ -0,0 +1,28 @@ +#! /usr/bin/env node + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const Path = require(`path`); +const here = Path.resolve(__dirname, '../'); +const tsnode = require.resolve(`ts-node/register`); +const arrow2csv = Path.join(here, `src/bin/arrow2csv.ts`); +const env = { ...process.env, TS_NODE_TRANSPILE_ONLY: `true` }; + +require('child_process').spawn(`node`, [ + `-r`, tsnode, arrow2csv, ...process.argv.slice(2) ], { cwd: here, env, stdio: `inherit` }); diff --git a/js/bin/file-to-stream.js b/js/bin/file-to-stream.js index fa4e5d17bbd3a..090cd0b0eda77 100755 --- a/js/bin/file-to-stream.js +++ b/js/bin/file-to-stream.js @@ -17,21 +17,24 @@ // specific language governing permissions and limitations // under the License.
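+// NOTE: `RecordBatchReader.throughNode()` and `RecordBatchStreamWriter.throughNode()` +// expose Node transform streams, so record batches are re-encoded incrementally as +// they arrive instead of being aggregated into a Table first.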
+// @ts-check + const fs = require('fs'); const path = require('path'); - -const encoding = 'binary'; -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const { util: { PipeIterator } } = require(`../index${ext}`); -const { Table, serializeStream, fromReadableStream } = require(`../index${ext}`); +const eos = require('util').promisify(require('stream').finished); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; +const { RecordBatchReader, RecordBatchStreamWriter } = require(`../index${extension}`); (async () => { - // Todo (ptaylor): implement `serializeStreamAsync` that accepts an - // AsyncIterable, rather than aggregating into a Table first - const in_ = process.argv.length < 3 - ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); - const out = process.argv.length < 4 - ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); - new PipeIterator(serializeStream(await Table.fromAsync(fromReadableStream(in_))), encoding).pipe(out); + + const readable = process.argv.length < 3 ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); + const writable = process.argv.length < 4 ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); + + const fileToStream = readable + .pipe(RecordBatchReader.throughNode()) + .pipe(RecordBatchStreamWriter.throughNode()) + .pipe(writable); + + await eos(fileToStream); })().catch((e) => { console.error(e); process.exit(1); }); diff --git a/js/bin/integration.js b/js/bin/integration.js index 6c064deac258d..c6f6cd7a24ed5 100755 --- a/js/bin/integration.js +++ b/js/bin/integration.js @@ -17,61 +17,55 @@ // specific language governing permissions and limitations // under the License. +// @ts-nocheck + const fs = require('fs'); -const glob = require('glob'); -const path = require('path'); -const child_process = require(`child_process`); +const Path = require('path'); +const { promisify } = require('util'); +const glob = promisify(require('glob')); +const { zip } = require('ix/iterable/zip'); +const { parse: bignumJSONParse } = require('json-bignum'); const argv = require(`command-line-args`)(cliOpts(), { partial: true }); -const gulpPath = require.resolve(path.join(`..`, `node_modules/gulp/bin/gulp.js`)); - -let jsonPaths = [...(argv.json || [])]; -let arrowPaths = [...(argv.arrow || [])]; +const { + Table, + RecordBatchReader, + util: { createElementComparator } +} = require('../targets/apache-arrow/Arrow.es5.min'); -if (!argv.mode) { - return print_usage(); +const exists = async (p) => { + try { + return !!(await fs.promises.stat(p)); + } catch (e) { return false; } } -let mode = argv.mode.toUpperCase(); -if (mode === 'VALIDATE' && !jsonPaths.length) { - jsonPaths = glob.sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)); - if (!arrowPaths.length) { - [jsonPaths, arrowPaths] = jsonPaths.reduce(([jsonPaths, arrowPaths], jsonPath) => { - const { name } = path.parse(jsonPath); - for (const source of ['cpp', 'java']) { - for (const format of ['file', 'stream']) { - const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); - if (fs.existsSync(arrowPath)) { - jsonPaths.push(jsonPath); - arrowPaths.push(arrowPath); - } - } - } - return [jsonPaths, arrowPaths]; - }, [[], []]); - console.log(`jsonPaths: [\n\t${jsonPaths.join('\n\t')}\n]`); - console.log(`arrowPaths: [\n\t${arrowPaths.join('\n\t')}\n]`); +(async () => { + + if (!argv.mode) { return print_usage(); } + + let mode = argv.mode.toUpperCase(); + let jsonPaths = 
[...(argv.json || [])]; + let arrowPaths = [...(argv.arrow || [])]; + + if (mode === 'VALIDATE' && !jsonPaths.length) { + [jsonPaths, arrowPaths] = await loadLocalJSONAndArrowPathsForDebugging(jsonPaths, arrowPaths); } -} else if (!jsonPaths.length) { - return print_usage(); -} -switch (mode) { - case 'VALIDATE': - const args = [`test`, `-i`].concat(argv._unknown || []); - jsonPaths.forEach((p, i) => { - args.push('-j', p, '-a', arrowPaths[i]); - }); - process.exitCode = child_process.spawnSync( - gulpPath, args, - { - cwd: path.resolve(__dirname, '..'), - stdio: ['ignore', 'inherit', 'inherit'] + if (!jsonPaths.length) { return print_usage(); } + + switch (mode) { + case 'VALIDATE': + for (let [jsonPath, arrowPath] of zip(jsonPaths, arrowPaths)) { + await validate(jsonPath, arrowPath); } - ).status || process.exitCode || 0; - break; - default: - print_usage(); -} + break; + default: + return print_usage(); + } +})() +.then((x) => +x || 0, (e) => { + e && process.stderr.write(`${e && e.stack || e}\n`); + return process.exitCode || 1; +}).then((code) => process.exit(code)); function cliOpts() { return [ @@ -118,5 +112,144 @@ function print_usage() { ] }, ])); - process.exit(1); + return 1; +} + +async function validate(jsonPath, arrowPath) { + + const files = await Promise.all([ + fs.promises.readFile(arrowPath), + fs.promises.readFile(jsonPath, 'utf8'), + ]); + + const arrowData = files[0]; + const jsonData = bignumJSONParse(files[1]); + + validateReaderIntegration(jsonData, arrowData); + validateTableFromBuffersIntegration(jsonData, arrowData); + validateTableToBuffersIntegration('json', 'file')(jsonData, arrowData); + validateTableToBuffersIntegration('json', 'stream')(jsonData, arrowData); + validateTableToBuffersIntegration('binary', 'file')(jsonData, arrowData); + validateTableToBuffersIntegration('binary', 'stream')(jsonData, arrowData); +} + +function validateReaderIntegration(jsonData, arrowBuffer) { + const msg = `json and arrow record batches report the same values`; + try { + const jsonReader = RecordBatchReader.from(jsonData); + const binaryReader = RecordBatchReader.from(arrowBuffer); + for (const [jsonRecordBatch, binaryRecordBatch] of zip(jsonReader, binaryReader)) { + compareTableIsh(jsonRecordBatch, binaryRecordBatch); + } + } catch (e) { throw new Error(`${msg}: fail \n ${e && e.stack || e}`); } + process.stdout.write(`${msg}: pass\n`); +} + +function validateTableFromBuffersIntegration(jsonData, arrowBuffer) { + const msg = `json and arrow tables report the same values`; + try { + const jsonTable = Table.from(jsonData); + const binaryTable = Table.from(arrowBuffer); + compareTableIsh(jsonTable, binaryTable); + } catch (e) { throw new Error(`${msg}: fail \n ${e && e.stack || e}`); } + process.stdout.write(`${msg}: pass\n`); +} + +function validateTableToBuffersIntegration(srcFormat, arrowFormat) { + const refFormat = srcFormat === `json` ? `binary` : `json`; + return function testTableToBuffersIntegration(jsonData, arrowBuffer) { + const msg = `serialized ${srcFormat} ${arrowFormat} reports the same values as the ${refFormat} ${arrowFormat}`; + try { + const refTable = Table.from(refFormat === `json` ? jsonData : arrowBuffer); + const srcTable = Table.from(srcFormat === `json` ?
jsonData : arrowBuffer); + const dstTable = Table.from(srcTable.serialize(`binary`, arrowFormat === `stream`)); + compareTableIsh(dstTable, refTable); + } catch (e) { throw new Error(`${msg}: fail \n ${e && e.stack || e}`); } + process.stdout.write(`${msg}: pass\n`); + }; +} + +function compareTableIsh(actual, expected) { + if (actual.length !== expected.length) { + throw new Error(`length: ${actual.length} !== ${expected.length}`); + } + if (actual.numCols !== expected.numCols) { + throw new Error(`numCols: ${actual.numCols} !== ${expected.numCols}`); + } + (() => { + const getChildAtFn = expected instanceof Table ? 'getColumnAt' : 'getChildAt'; + for (let i = -1, n = actual.numCols; ++i < n;) { + const v1 = actual[getChildAtFn](i); + const v2 = expected[getChildAtFn](i); + compareVectors(v1, v2); + } + })(); +} + +function compareVectors(actual, expected) { + + if ((actual == null && expected != null) || (expected == null && actual != null)) { + throw new Error(`${actual == null ? `actual` : `expected`} is null, was expecting ${actual == null ? expected : actual} to be that also`); + } + + let props = ['type', 'length', 'nullCount']; + + (() => { + for (let i = -1, n = props.length; ++i < n;) { + const prop = props[i]; + if (`${actual[prop]}` !== `${expected[prop]}`) { + throw new Error(`${prop}: ${actual[prop]} !== ${expected[prop]}`); + } + } + })(); + + (() => { + for (let i = -1, n = actual.length; ++i < n;) { + let x1 = actual.get(i), x2 = expected.get(i); + if (!createElementComparator(x2)(x1)) { + throw new Error(`${i}: ${x1} !== ${x2}`); + } + } + })(); + + (() => { + let i = -1; + for (let [x1, x2] of zip(actual, expected)) { + ++i; + if (!createElementComparator(x2)(x1)) { + throw new Error(`${i}: ${x1} !== ${x2}`); + } + } + })(); +} + +async function loadLocalJSONAndArrowPathsForDebugging(jsonPaths, arrowPaths) { + + const sourceJSONPaths = await glob(Path.resolve(__dirname, `../test/data/json/`, `*.json`)); + + if (!arrowPaths.length) { + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'cpp', 'file'); + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'java', 'file'); + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'cpp', 'stream'); + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'java', 'stream'); + } + + for (let [jsonPath, arrowPath] of zip(jsonPaths, arrowPaths)) { + console.log(`jsonPath: ${jsonPath}`); + console.log(`arrowPath: ${arrowPath}`); + } + + return [jsonPaths, arrowPaths]; + + async function loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, source, format) { + for (const jsonPath of sourceJSONPaths) { + const { name } = Path.parse(jsonPath); + const arrowPath = Path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); + if (await exists(arrowPath)) { + jsonPaths.push(jsonPath); + arrowPaths.push(arrowPath); + } + } + return [jsonPaths, arrowPaths]; + } } diff --git a/js/bin/json-to-arrow.js b/js/bin/json-to-arrow.js index f28b4145ffaed..7a98d56d1a5e2 100755 --- a/js/bin/json-to-arrow.js +++ b/js/bin/json-to-arrow.js @@ -17,37 +17,46 @@ // specific language governing permissions and limitations // under the License. 
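+// NOTE: `eos` is util.promisify(stream.finished); awaiting it keeps the process +// alive until the destination stream has fully flushed or errored.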
+// @ts-check + const fs = require('fs'); -const glob = require('glob'); -const path = require('path'); -const { promisify } = require('util'); +const Path = require('path'); const { parse } = require('json-bignum'); +const eos = require('util').promisify(require('stream').finished); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; const argv = require(`command-line-args`)(cliOpts(), { partial: true }); +const { RecordBatchReader, RecordBatchFileWriter, RecordBatchStreamWriter } = require(`../index${extension}`); -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const { Table } = require(`../index${ext}`); - -const encoding = 'binary'; -const stream = argv.format === 'stream'; const jsonPaths = [...(argv.json || [])]; const arrowPaths = [...(argv.arrow || [])]; -if (!jsonPaths.length || !arrowPaths.length || (jsonPaths.length !== arrowPaths.length)) { - return print_usage(); -} +(async () => { -const readFile = callResolved(promisify(fs.readFile)); -const writeFile = callResolved(promisify(fs.writeFile)); + if (!jsonPaths.length || !arrowPaths.length || (jsonPaths.length !== arrowPaths.length)) { + return print_usage(); + } -(async () => await Promise.all(jsonPaths.map(async (jPath, i) => { - const aPath = arrowPaths[i]; - const arrowTable = Table.from(parse('' + (await readFile(jPath)))); - await writeFile(aPath, arrowTable.serialize(encoding, stream), encoding); -})))().catch((e) => { console.error(e); process.exit(1); }); + await Promise.all(jsonPaths.map(async (path, i) => { + + const RecordBatchWriter = argv.format !== 'stream' + ? RecordBatchFileWriter + : RecordBatchStreamWriter; -function callResolved(fn) { - return async (path_, ...xs) => await fn(path.resolve(path_), ...xs); -} + const reader = RecordBatchReader.from(parse( + await fs.promises.readFile(Path.resolve(path), 'utf8'))); + + const jsonToArrow = reader + .pipe(RecordBatchWriter.throughNode()) + .pipe(fs.createWriteStream(arrowPaths[i])); + + await eos(jsonToArrow); + + })); +})() +.then((x) => +x || 0, (e) => { + e && process.stderr.write(`${e}`); + return process.exitCode || 1; +}).then((code = 0) => process.exit(code)); function cliOpts() { return [ @@ -95,5 +104,5 @@ function print_usage() { ] }, ])); - process.exit(1); + return 1; } diff --git a/js/bin/print-buffer-alignment.js b/js/bin/print-buffer-alignment.js index a4cd9bb2351e7..8d422aad60d74 100755 --- a/js/bin/print-buffer-alignment.js +++ b/js/bin/print-buffer-alignment.js @@ -17,34 +17,41 @@ // specific language governing permissions and limitations // under the License. +// @ts-check + const fs = require('fs'); const path = require('path'); - -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const base = process.env.ARROW_JS_DEBUG === 'src' ? '../src' : '../targets/apache-arrow'; -const { Message } = require(`${base}/ipc/metadata${ext}`); -const { readBuffersAsync } = require(`${base}/ipc/reader/binary${ext}`); -const { Table, VectorVisitor, fromReadableStream } = require(`../index${ext}`); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; +const { AsyncMessageReader } = require(`../index${extension}`); (async () => { - const in_ = process.argv.length < 3 - ? 
process.stdin : fs.createReadStream(path.resolve(process.argv[2])); - - let recordBatchIndex = 0; - let dictionaryBatchIndex = 0; - - for await (let { message, loader } of readBuffersAsync(fromReadableStream(in_))) { - - if (Message.isRecordBatch(message)) { - console.log(`record batch ${++recordBatchIndex}, offset ${loader.messageOffset}`); - } else if (Message.isDictionaryBatch(message)) { - message = message.data; - console.log(`dictionary batch ${++dictionaryBatchIndex}, offset ${loader.messageOffset}`); - } else { continue; } - - message.buffers.forEach(({offset, length}, i) => { - console.log(`\tbuffer ${i+1}: { offset: ${offset}, length: ${length} }`); + + const readable = process.argv.length < 3 ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); + const reader = new AsyncMessageReader(readable); + + let recordBatchIndex = 0, dictionaryBatchIndex = 0; + + for await (let message of reader) { + + let bufferRegions = []; + + if (message.isSchema()) { + continue; + } else if (message.isRecordBatch()) { + bufferRegions = message.header().buffers; + const body = await reader.readMessageBody(message.bodyLength); + console.log(`record batch ${++recordBatchIndex}, byteOffset ${body.byteOffset}`); + } else if (message.isDictionaryBatch()) { + bufferRegions = message.header().data.buffers; + const body = await reader.readMessageBody(message.bodyLength); + console.log(`dictionary batch ${++dictionaryBatchIndex}, byteOffset ${body.byteOffset}`); + } + + bufferRegions.forEach(({ offset, length }, i) => { + console.log(`\tbuffer ${i + 1}: { offset: ${offset}, length: ${length} }`); }); } + await reader.return(); + })().catch((e) => { console.error(e); process.exit(1); }); diff --git a/js/bin/stream-to-file.js b/js/bin/stream-to-file.js index f33646ac61a41..015a5eace74d8 100755 --- a/js/bin/stream-to-file.js +++ b/js/bin/stream-to-file.js @@ -17,21 +17,24 @@ // specific language governing permissions and limitations // under the License. +// @ts-check + const fs = require('fs'); const path = require('path'); - -const encoding = 'binary'; -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const { util: { PipeIterator } } = require(`../index${ext}`); -const { Table, serializeFile, fromReadableStream } = require(`../index${ext}`); +const eos = require('util').promisify(require('stream').finished); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; +const { RecordBatchReader, RecordBatchFileWriter } = require(`../index${extension}`); (async () => { - // Todo (ptaylor): implement `serializeFileAsync` that accepts an - // AsyncIterable, rather than aggregating into a Table first - const in_ = process.argv.length < 3 - ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); - const out = process.argv.length < 4 - ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); - new PipeIterator(serializeFile(await Table.fromAsync(fromReadableStream(in_))), encoding).pipe(out); + + const readable = process.argv.length < 3 ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); + const writable = process.argv.length < 4 ? 
process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); + + const streamToFile = readable + .pipe(RecordBatchReader.throughNode()) + .pipe(RecordBatchFileWriter.throughNode()) + .pipe(writable); + + await eos(streamToFile); })().catch((e) => { console.error(e); process.exit(1); }); diff --git a/js/examples/read_file.html b/js/examples/read_file.html index 3e082d9dc412f..ec96d0e4755e2 100644 --- a/js/examples/read_file.html +++ b/js/examples/read_file.html @@ -86,6 +86,6 @@
- + diff --git a/js/gulp/argv.js b/js/gulp/argv.js index 7dceb0f74c587..3a028f813f936 100644 --- a/js/gulp/argv.js +++ b/js/gulp/argv.js @@ -21,16 +21,12 @@ const path = require('path'); const argv = require(`command-line-args`)([ { name: `all`, type: Boolean }, - { name: 'update', alias: 'u', type: Boolean }, - { name: 'verbose', alias: 'v', type: Boolean }, + { name: 'verbose', alias: `v`, type: Boolean }, { name: `target`, type: String, defaultValue: `` }, { name: `module`, type: String, defaultValue: `` }, { name: `coverage`, type: Boolean, defaultValue: false }, - { name: `integration`, alias: `i`, type: Boolean, defaultValue: false }, { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] }, - { name: `json_files`, alias: `j`, type: String, multiple: true, defaultValue: [] }, - { name: `arrow_files`, alias: `a`, type: String, multiple: true, defaultValue: [] }, ], { partial: true }); const { targets, modules } = argv; @@ -44,25 +40,4 @@ if (argv.target === `src`) { (argv.all || !modules.length) && modules.push(`all`); } -if (argv.coverage && (!argv.json_files || !argv.json_files.length)) { - - let [jsonPaths, arrowPaths] = glob - .sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)) - .reduce((paths, jsonPath) => { - const { name } = path.parse(jsonPath); - const [jsonPaths, arrowPaths] = paths; - ['cpp', 'java'].forEach((source) => ['file', 'stream'].forEach((format) => { - const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); - if (fs.existsSync(arrowPath)) { - jsonPaths.push(jsonPath); - arrowPaths.push(arrowPath); - } - })); - return paths; - }, [[], []]); - - argv.json_files = jsonPaths; - argv.arrow_files = arrowPaths; -} - module.exports = { argv, targets, modules }; diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js index 95fc1eed0f84e..e119c540dc351 100644 --- a/js/gulp/arrow-task.js +++ b/js/gulp/arrow-task.js @@ -16,24 +16,22 @@ // under the License. 
const { - mainExport, gCCLanguageNames, targetDir, observableFromStreams } = require('./util'); const del = require('del'); const gulp = require('gulp'); -const path = require('path'); const { promisify } = require('util'); const gulpRename = require(`gulp-rename`); const { memoizeTask } = require('./memoize-task'); const exec = promisify(require('child_process').exec); const { Observable, ReplaySubject } = require('rxjs'); -const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, format) { +const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target) { const out = targetDir(target); const dtsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.ts`; const cjsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.js`; - const esmGlob = `${targetDir(`es2015`, `esm`)}/**/*.js`; + const esmGlob = `${targetDir(`esnext`, `esm`)}/**/*.js`; const es5UmdGlob = `${targetDir(`es5`, `umd`)}/*.js`; const es5UmdMaps = `${targetDir(`es5`, `umd`)}/*.map`; const es2015UmdGlob = `${targetDir(`es2015`, `umd`)}/*.js`; @@ -46,7 +44,7 @@ const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, forma observableFromStreams(gulp.src(esmGlob), ch_ext(`.mjs`), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs` observableFromStreams(gulp.src(es5UmdGlob), append(`.es5.min`), gulp.dest(out)), // copy es5 umd files and add `.min` observableFromStreams(gulp.src(es5UmdMaps), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename - observableFromStreams(gulp.src(es2015UmdGlob), append(`.es2015.min`), gulp.dest(out)), // copy es2015 umd files and add `.es6.min` + observableFromStreams(gulp.src(es2015UmdGlob), append(`.es2015.min`), gulp.dest(out)), // copy es2015 umd files and add `.es2015.min` observableFromStreams(gulp.src(es2015UmdMaps), gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename ).publish(new ReplaySubject()).refCount(); }))({}); @@ -61,4 +59,4 @@ const arrowTSTask = ((cache) => memoizeTask(cache, async function copyTS(target, module.exports = arrowTask; module.exports.arrowTask = arrowTask; -module.exports.arrowTSTask = arrowTSTask; \ No newline at end of file +module.exports.arrowTSTask = arrowTSTask; diff --git a/js/gulp/clean-task.js b/js/gulp/clean-task.js index d6c90f4637c8b..551aeb41af739 100644 --- a/js/gulp/clean-task.js +++ b/js/gulp/clean-task.js @@ -16,16 +16,15 @@ // under the License. 
const del = require('del'); +const { Observable } = require('rxjs'); const { targetDir } = require('./util'); -const { memoizeTask } = require('./memoize-task'); -const { Observable, ReplaySubject } = require('rxjs'); +const memoizeTask = require('./memoize-task'); const cleanTask = ((cache) => memoizeTask(cache, function clean(target, format) { - return Observable - .from(del(`${targetDir(target, format)}/**`)) - .catch((e) => Observable.empty()) - .multicast(new ReplaySubject()).refCount(); + const dir = targetDir(target, format); + return Observable.from(del(dir)) + .catch((e) => Observable.empty()); }))({}); module.exports = cleanTask; -module.exports.cleanTask = cleanTask; \ No newline at end of file +module.exports.cleanTask = cleanTask; diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js index 547e760a7fa8a..ef629982ae39f 100644 --- a/js/gulp/closure-task.js +++ b/js/gulp/closure-task.js @@ -18,52 +18,83 @@ const { targetDir, mainExport, + esmRequire, gCCLanguageNames, - UMDSourceTargets, - observableFromStreams + publicModulePaths, + observableFromStreams, + shouldRunInChildProcess, + spawnGulpCommandInChildProcess, } = require('./util'); +const fs = require('fs'); const gulp = require('gulp'); const path = require('path'); const sourcemaps = require('gulp-sourcemaps'); const { memoizeTask } = require('./memoize-task'); const { compileBinFiles } = require('./typescript-task'); -const { Observable, ReplaySubject } = require('rxjs'); +const mkdirp = require('util').promisify(require('mkdirp')); const closureCompiler = require('google-closure-compiler').gulp(); -const closureTask = ((cache) => memoizeTask(cache, function closure(target, format) { +const closureTask = ((cache) => memoizeTask(cache, async function closure(target, format) { + + if (shouldRunInChildProcess(target, format)) { + return spawnGulpCommandInChildProcess('compile', target, format); + } + const src = targetDir(target, `cls`); + const srcAbsolute = path.resolve(src); const out = targetDir(target, format); - const entry = path.join(src, mainExport); - const externs = path.join(`src/Arrow.externs.js`); - return observableFromStreams( - gulp.src([ -/* external libs first --> */ `node_modules/tslib/package.json`, - `node_modules/tslib/tslib.es6.js`, - `node_modules/flatbuffers/package.json`, - `node_modules/flatbuffers/js/flatbuffers.mjs`, - `node_modules/text-encoding-utf-8/package.json`, - `node_modules/text-encoding-utf-8/src/encoding.js`, -/* then sources globs --> */ `${src}/**/*.js`, - ], { base: `./` }), - sourcemaps.init(), - closureCompiler(createClosureArgs(entry, externs)), - // rename the sourcemaps from *.js.map files to *.min.js.map - sourcemaps.write(`.`, { mapFile: (mapPath) => mapPath.replace(`.js.map`, `.${target}.min.js.map`) }), - gulp.dest(out) - ) - .merge(compileBinFiles(target, format)) - .takeLast(1) - .publish(new ReplaySubject()).refCount(); + const externs = path.join(`${out}/${mainExport}.externs.js`); + const entry_point = path.join(`${src}/${mainExport}.dom.cls.js`); + + const exportedImports = publicModulePaths(srcAbsolute).reduce((entries, publicModulePath) => [ + ...entries, { + publicModulePath, + exports_: getPublicExportedNames(esmRequire(publicModulePath, { warnings: false })) + } + ], []); + + await mkdirp(out); + + await Promise.all([ + fs.promises.writeFile(externs, generateExternsFile(exportedImports)), + fs.promises.writeFile(entry_point, generateUMDExportAssignment(srcAbsolute, exportedImports)) + ]); + + return await Promise.all([
runClosureCompileAsObservable().toPromise(), + compileBinFiles(target, format).toPromise() + ]); + + function runClosureCompileAsObservable() { + return observableFromStreams( + gulp.src([ + /* external libs first */ + `node_modules/flatbuffers/package.json`, + `node_modules/flatbuffers/js/flatbuffers.mjs`, + `node_modules/text-encoding-utf-8/package.json`, + `node_modules/text-encoding-utf-8/src/encoding.js`, + `${src}/**/*.js` /* <-- then source globs */ + ], { base: `./` }), + sourcemaps.init(), + closureCompiler(createClosureArgs(entry_point, externs)), + // rename the sourcemaps from *.js.map files to *.min.js.map + sourcemaps.write(`.`, { mapFile: (mapPath) => mapPath.replace(`.js.map`, `.${target}.min.js.map`) }), + gulp.dest(out) + ); + } }))({}); -const createClosureArgs = (entry, externs) => ({ +module.exports = closureTask; +module.exports.closureTask = closureTask; + +const createClosureArgs = (entry_point, externs) => ({ externs, + entry_point, third_party: true, warning_level: `QUIET`, dependency_mode: `STRICT`, rewrite_polyfills: false, - entry_point: `${entry}.js`, module_resolution: `NODE`, // formatting: `PRETTY_PRINT`, // debug: true, @@ -72,10 +103,99 @@ const createClosureArgs = (entry, externs) => ({ package_json_entry_names: `module,jsnext:main,main`, assume_function_wrapper: true, js_output_file: `${mainExport}.js`, - language_in: gCCLanguageNames[`es2015`], + language_in: gCCLanguageNames[`esnext`], language_out: gCCLanguageNames[`es5`], - output_wrapper: -`// Licensed to the Apache Software Foundation (ASF) under one + output_wrapper: `${apacheHeader()} +(function (global, factory) { + typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : + typeof define === 'function' && define.amd ? define(['Arrow'], factory) : + (factory(global.Arrow = global.Arrow || {})); +}(this, (function (exports) {%output%}.bind(this))));` +}); + +function generateUMDExportAssignment(src, exportedImports) { + return [ + ...exportedImports.map(({ publicModulePath }, i) => { + const p = publicModulePath.slice(src.length + 1); + return (`import * as exports${i} from './${p}';`); + }).filter(Boolean), + 'Object.assign(arguments[0], exports0);' + ].join('\n'); +} + +function generateExternsFile(exportedImports) { + return [ + externsHeader(), + ...exportedImports.reduce((externBodies, { exports_ }) => [ + ...externBodies, ...exports_.map(externBody) + ], []).filter(Boolean) + ].join('\n'); +} + +function externBody({ exportName, staticNames, instanceNames }) { + return [ + `var ${exportName} = function() {};`, + staticNames.map((staticName) => (isNaN(+staticName) + ? `/** @type {?} */\n${exportName}.${staticName} = function() {};` + : `/** @type {?} */\n${exportName}[${staticName}] = function() {};` + )).join('\n'), + instanceNames.map((instanceName) => (isNaN(+instanceName) + ?
`/** @type {?} */\n${exportName}.prototype.${instanceName};` + : `/** @type {?} */\n${exportName}.prototype[${instanceName}];` + )).join('\n') + ].filter(Boolean).join('\n'); +} + +function externsHeader() { + return (`${apacheHeader()} +// @ts-nocheck +/* tslint:disable */ +/** + * @fileoverview Closure Compiler externs for Arrow + * @externs + * @suppress {duplicate,checkTypes} + */ +/** @type {symbol} */ +Symbol.iterator; +/** @type {symbol} */ +Symbol.toPrimitive; +/** @type {symbol} */ +Symbol.asyncIterator; +`); +} + +function getPublicExportedNames(entryModule) { + const fn = function() {}; + const isStaticOrProtoName = (x) => ( + !(x in fn) && + (x !== `default`) && + (x !== `undefined`) && + (x !== `__esModule`) && + (x !== `constructor`) && + !(x.startsWith('_')) + ); + return Object + .getOwnPropertyNames(entryModule) + .filter((name) => name !== 'default') + .filter((name) => ( + typeof entryModule[name] === `object` || + typeof entryModule[name] === `function` + )) + .map((name) => [name, entryModule[name]]) + .reduce((reserved, [name, value]) => { + + const staticNames = value && + typeof value === 'object' ? Object.getOwnPropertyNames(value).filter(isStaticOrProtoName) : + typeof value === 'function' ? Object.getOwnPropertyNames(value).filter(isStaticOrProtoName) : []; + + const instanceNames = (typeof value === `function` && Object.getOwnPropertyNames(value.prototype || {}) || []).filter(isStaticOrProtoName); + + return [...reserved, { exportName: name, staticNames, instanceNames }]; + }, []); +} + +function apacheHeader() { + return `// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file @@ -90,13 +210,5 @@ const createClosureArgs = (entry, externs) => ({ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations -// under the License. -(function (global, factory) { - typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : - typeof define === 'function' && define.amd ? define(['exports'], factory) : - (factory(global.Arrow = global.Arrow || {})); -}(this, (function (exports) {%output%}.bind(this))));` -}); - -module.exports = closureTask; -module.exports.closureTask = closureTask; +// under the License.` +} diff --git a/js/gulp/build-task.js b/js/gulp/compile-task.js similarity index 90% rename from js/gulp/build-task.js rename to js/gulp/compile-task.js index 9f3402cdd3508..60e2ebbe36a93 100644 --- a/js/gulp/build-task.js +++ b/js/gulp/compile-task.js @@ -24,7 +24,7 @@ const closureTask = require('./closure-task'); const typescriptTask = require('./typescript-task'); const { arrowTask, arrowTSTask } = require('./arrow-task'); -const buildTask = ((cache) => memoizeTask(cache, function build(target, format, ...args) { +const compileTask = ((cache) => memoizeTask(cache, function compile(target, format, ...args) { return target === `src` ? Observable.empty() : target === npmPkgName ? arrowTask(target, format, ...args)() : target === `ts` ? 
arrowTSTask(target, format, ...args)() @@ -33,5 +33,5 @@ const buildTask = ((cache) => memoizeTask(cache, function build(target, format, : typescriptTask(target, format, ...args)(); }))({}); -module.exports = buildTask; -module.exports.buildTask = buildTask; +module.exports = compileTask; +module.exports.compileTask = compileTask; diff --git a/js/gulp/memoize-task.js b/js/gulp/memoize-task.js index 0b0fc843c451a..408ee3b8839db 100644 --- a/js/gulp/memoize-task.js +++ b/js/gulp/memoize-task.js @@ -17,6 +17,13 @@ const { taskName } = require('./util'); +const createTask = ((taskFn) => ((target, format, ...args) => { + // Give the memoized fn a displayName so gulp's output is easier to follow. + const fn = () => taskFn(target, format, ...args); + fn.displayName = `${taskFn.name || ``}:${taskName(target, format, ...args)}:task`; + return fn; +})); + const memoizeTask = ((cache, taskFn) => ((target, format, ...args) => { // Give the memoized fn a displayName so gulp's output is easier to follow. const fn = () => ( @@ -27,4 +34,5 @@ const memoizeTask = ((cache, taskFn) => ((target, format, ...args) => { })); module.exports = memoizeTask; -module.exports.memoizeTask = memoizeTask; \ No newline at end of file +module.exports.createTask = createTask; +module.exports.memoizeTask = memoizeTask; diff --git a/js/gulp/minify-task.js b/js/gulp/minify-task.js index 82145aa90861a..81cb5e5f3f536 100644 --- a/js/gulp/minify-task.js +++ b/js/gulp/minify-task.js @@ -18,10 +18,10 @@ const { targetDir, mainExport, - ESKeywords, UMDSourceTargets, terserLanguageNames, - observableFromStreams + shouldRunInChildProcess, + spawnGulpCommandInChildProcess, } = require('./util'); const path = require('path'); @@ -30,41 +30,24 @@ const { memoizeTask } = require('./memoize-task'); const { compileBinFiles } = require('./typescript-task'); const { Observable, ReplaySubject } = require('rxjs'); const TerserPlugin = require(`terser-webpack-plugin`); -const esmRequire = require(`@std/esm`)(module, { - mode: `js`, - warnings: false, - cjs: { - /* A boolean for storing ES modules in require.cache. */ - cache: true, - /* A boolean for respecting require.extensions in ESM. */ - extensions: true, - /* A boolean for __esModule interoperability. */ - interop: true, - /* A boolean for importing named exports of CJS modules. */ - namedExports: true, - /* A boolean for following CJS path rules in ESM. */ - paths: true, - /* A boolean for __dirname, __filename, and require in ESM. 
*/ - vars: true, - } -}); const minifyTask = ((cache, commonConfig) => memoizeTask(cache, function minifyJS(target, format) { + if (shouldRunInChildProcess(target, format)) { + return spawnGulpCommandInChildProcess('compile', target, format); + } + const sourceTarget = UMDSourceTargets[target]; - const PublicNames = reservePublicNames(sourceTarget, `cls`); const out = targetDir(target, format), src = targetDir(sourceTarget, `cls`); const targetConfig = { ...commonConfig, output: { ...commonConfig.output, path: path.resolve(`./${out}`) } }; - const webpackConfigs = [ - [mainExport, PublicNames] - ].map(([entry, reserved]) => ({ + const webpackConfigs = [mainExport].map((entry) => ({ ...targetConfig, name: entry, - entry: { [entry]: path.resolve(`${src}/${entry}.js`) }, + entry: { [entry]: path.resolve(`${src}/${entry}.dom.js`) }, plugins: [ ...(targetConfig.plugins || []), new webpack.SourceMapDevToolPlugin({ @@ -73,20 +56,23 @@ const minifyTask = ((cache, commonConfig) => memoizeTask(cache, function minifyJ resourcePath .replace(/\s/, `_`) .replace(/\.\/node_modules\//, ``) - }), - new TerserPlugin({ - sourceMap: true, - terserOptions: { - ecma: terserLanguageNames[target], - compress: { unsafe: true }, - output: { comments: false, beautify: false }, - mangle: { eval: true, - properties: { reserved, keep_quoted: true } - }, - safari10: true // <-- works around safari10 bugs, see the "safari10" option here: https://github.com/terser-js/terser#minify-options - }, }) - ] + ], + optimization: { + minimize: true, + minimizer: [ + new TerserPlugin({ + sourceMap: true, + terserOptions: { + ecma: terserLanguageNames[target], + output: { comments: false, beautify: false }, + compress: { unsafe: true }, + mangle: true, + safari10: true // <-- works around safari10 bugs, see the "safari10" option here: https://github.com/terser-js/terser#minify-options + }, + }) + ] + } })); const compilers = webpack(webpackConfigs); @@ -102,42 +88,3 @@ const minifyTask = ((cache, commonConfig) => memoizeTask(cache, function minifyJ module.exports = minifyTask; module.exports.minifyTask = minifyTask; - -const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, format) { - const src = targetDir(target, format); - const publicModulePaths = [ - `../${src}/data.js`, - `../${src}/type.js`, - `../${src}/table.js`, - `../${src}/vector.js`, - `../${src}/util/int.js`, - `../${src}/predicate.js`, - `../${src}/recordbatch.js`, - `../${src}/${mainExport}.js`, - ]; - return publicModulePaths.reduce((keywords, publicModulePath) => [ - ...keywords, ...reserveExportedNames(esmRequire(publicModulePath, { warnings: false })) - ], [...ESKeywords]); -})(ESKeywords); - -// Reflect on the Arrow modules to come up with a list of keys to save from -// Terser's -// mangler. Assume all the non-inherited static and prototype members of the Arrow -// module and its direct exports are public, and should be preserved through minification. 
-const reserveExportedNames = (entryModule) => ( - Object - .getOwnPropertyNames(entryModule) - .filter((name) => ( - typeof entryModule[name] === `object` || - typeof entryModule[name] === `function` - )) - .map((name) => [name, entryModule[name]]) - .reduce((reserved, [name, value]) => { - const fn = function() {}; - const ownKeys = value && typeof value === 'object' && Object.getOwnPropertyNames(value) || []; - const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype || {}) || []; - const publicNames = [...ownKeys, ...protoKeys].filter((x) => x !== `default` && x !== `undefined` && !(x in fn)); - return [...reserved, name, ...publicNames]; - }, [] - ) -); diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js index 8c0f8fb0e4767..2a67c812206ce 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -46,17 +46,19 @@ const createMainPackageJson = (target, format) => (orig) => ({ ...createTypeScriptPackageJson(target, format)(orig), bin: orig.bin, name: npmPkgName, - main: mainExport, - types: `${mainExport}.d.ts`, - module: `${mainExport}.mjs`, + main: `${mainExport}.node`, + browser: `${mainExport}.dom`, + types: `${mainExport}.node.d.ts`, unpkg: `${mainExport}.es5.min.js`, - [`@std/esm`]: { mode: `all`, warnings: false, sourceMap: true } + [`esm`]: { mode: `all`, sourceMap: true } }); const createTypeScriptPackageJson = (target, format) => (orig) => ({ ...createScopedPackageJSON(target, format)(orig), - main: `${mainExport}.ts`, types: `${mainExport}.ts`, bin: undefined, + main: `${mainExport}.node.ts`, + types: `${mainExport}.node.ts`, + browser: `${mainExport}.dom.ts`, dependencies: { '@types/flatbuffers': '*', '@types/node': '*', @@ -70,8 +72,10 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), { name: `${npmOrgName}/${packageName(target, format)}`, - version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, - unpkg: undefined, module: undefined, [`@std/esm`]: undefined + browser: format === 'umd' ? undefined : `${mainExport}.dom`, + main: format === 'umd' ? `${mainExport}` : `${mainExport}.node`, + types: format === 'umd' ? undefined : `${mainExport}.node.d.ts`, + version: undefined, unpkg: undefined, module: undefined, [`esm`]: undefined, } ) ) @@ -80,6 +84,5 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( format !== `esm` && format !== `cls` ? 
packageJSON - : { ...packageJSON, [`@std/esm`]: { mode: `js`, warnings: false, sourceMap: true } } + : { ...packageJSON, [`esm`]: { mode: `auto`, sourceMap: true } } ); - \ No newline at end of file diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js index b0e34f8c94426..c7ad7d513c652 100644 --- a/js/gulp/test-task.js +++ b/js/gulp/test-task.js @@ -20,44 +20,47 @@ const path = require('path'); const { argv } = require('./argv'); const { promisify } = require('util'); const glob = promisify(require('glob')); -const stat = promisify(require('fs').stat); const mkdirp = promisify(require('mkdirp')); const rimraf = promisify(require('rimraf')); const child_process = require(`child_process`); const { memoizeTask } = require('./memoize-task'); const readFile = promisify(require('fs').readFile); +const asyncDone = promisify(require('async-done')); const exec = promisify(require('child_process').exec); const parseXML = promisify(require('xml2js').parseString); const jestArgv = []; -argv.update && jestArgv.push(`-u`); argv.verbose && jestArgv.push(`--verbose`); -argv.coverage && jestArgv.push(`--coverage`); +argv.coverage + ? jestArgv.push(`-c`, `jest.coverage.config.js`, `--coverage`) + : jestArgv.push(`-c`, `jest.config.js`, `-i`) -const debugArgv = [`--runInBand`, `--env`, `node-debug`]; -const jest = require.resolve(path.join(`..`, `node_modules`, `.bin`, `jest`)); +const jest = path.join(path.parse(require.resolve(`jest`)).dir, `../bin/jest.js`); const testOptions = { - env: { ...process.env }, stdio: [`ignore`, `inherit`, `inherit`], + env: { + ...process.env, + // hide fs.promises/stream[Symbol.asyncIterator] warnings + NODE_NO_WARNINGS: `1`, + // prevent the user-land `readable-stream` module from + // patching node's streams -- they're better now + READABLE_STREAM: `disable` + }, }; -const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format, debug = false) { +const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format) { const opts = { ...testOptions }; - const args = !debug ? [...execArgv] : [...debugArgv, ...execArgv]; - if (!argv.coverage) { - args.push(`test/${argv.integration ? `integration/*` : `unit/*`}`); - } - opts.env = { ...opts.env, + const args = [...execArgv, `test/unit/`]; + opts.env = { + ...opts.env, TEST_TARGET: target, TEST_MODULE: format, - TEST_TS_SOURCE: !!argv.coverage || (target === 'src') || (opts.env.TEST_TS_SOURCE === 'true'), - JSON_PATHS: JSON.stringify(Array.isArray(argv.json_files) ? argv.json_files : [argv.json_files]), - ARROW_PATHS: JSON.stringify(Array.isArray(argv.arrow_files) ? argv.arrow_files : [argv.arrow_files]), + TEST_DOM_STREAMS: (target ==='src' || format === 'umd').toString(), + TEST_NODE_STREAMS: (target ==='src' || format !== 'umd').toString(), + TEST_TS_SOURCE: !!argv.coverage || (target === 'src') || (opts.env.TEST_TS_SOURCE === 'true') }; - return !debug ? 
- child_process.spawn(jest, args, opts) : - child_process.exec(`node --inspect-brk ${jest} ${args.join(` `)}`, opts); -}))({}, jestArgv, testOptions); + return asyncDone(() => child_process.spawn(`node`, args, opts)); +}))({}, [jest, ...jestArgv], testOptions); module.exports = testTask; module.exports.testTask = testTask; @@ -69,9 +72,9 @@ const ARROW_HOME = process.env.ARROW_HOME || path.resolve('../'); const ARROW_JAVA_DIR = process.env.ARROW_JAVA_DIR || path.join(ARROW_HOME, 'java'); const CPP_EXE_PATH = process.env.ARROW_CPP_EXE_PATH || path.join(ARROW_HOME, 'cpp/build/debug'); const ARROW_INTEGRATION_DIR = process.env.ARROW_INTEGRATION_DIR || path.join(ARROW_HOME, 'integration'); -const CPP_JSON_TO_ARROW = path.join(CPP_EXE_PATH, 'json-integration-test'); -const CPP_STREAM_TO_FILE = path.join(CPP_EXE_PATH, 'stream-to-file'); -const CPP_FILE_TO_STREAM = path.join(CPP_EXE_PATH, 'file-to-stream'); +const CPP_JSON_TO_ARROW = path.join(CPP_EXE_PATH, 'arrow-json-integration-test'); +const CPP_STREAM_TO_FILE = path.join(CPP_EXE_PATH, 'arrow-stream-to-file'); +const CPP_FILE_TO_STREAM = path.join(CPP_EXE_PATH, 'arrow-file-to-stream'); const testFilesDir = path.join(ARROW_HOME, 'js/test/data'); const snapshotsDir = path.join(ARROW_HOME, 'js/test/__snapshots__'); diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js index beffab8a08ce0..fe694cac860b3 100644 --- a/js/gulp/typescript-task.js +++ b/js/gulp/typescript-task.js @@ -16,19 +16,26 @@ // under the License. const { - targetDir, tsconfigName, observableFromStreams + targetDir, + tsconfigName, + observableFromStreams, + shouldRunInChildProcess, + spawnGulpCommandInChildProcess, } = require('./util'); -const del = require('del'); const gulp = require('gulp'); const path = require('path'); const ts = require(`gulp-typescript`); -const gulpRename = require(`gulp-rename`); const sourcemaps = require('gulp-sourcemaps'); const { memoizeTask } = require('./memoize-task'); const { Observable, ReplaySubject } = require('rxjs'); const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target, format) { + + if (shouldRunInChildProcess(target, format)) { + return spawnGulpCommandInChildProcess('compile', target, format); + } + const out = targetDir(target, format); const tsconfigPath = path.join(`tsconfig`, `tsconfig.${tsconfigName(target, format)}.json`); return compileTypescript(out, tsconfigPath) @@ -39,11 +46,11 @@ const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target function compileBinFiles(target, format) { const out = targetDir(target, format); const tsconfigPath = path.join(`tsconfig`, `tsconfig.${tsconfigName('bin', 'cjs')}.json`); - return compileTypescript(path.join(out, 'bin'), tsconfigPath); + return compileTypescript(path.join(out, 'bin'), tsconfigPath, { target }); } -function compileTypescript(out, tsconfigPath) { - const tsProject = ts.createProject(tsconfigPath, { typescript: require(`typescript`) }); +function compileTypescript(out, tsconfigPath, tsconfigOverrides) { + const tsProject = ts.createProject(tsconfigPath, { typescript: require(`typescript`), ...tsconfigOverrides }); const { stream: { js, dts } } = observableFromStreams( tsProject.src(), sourcemaps.init(), tsProject(ts.reporter.defaultReporter()) diff --git a/js/gulp/util.js b/js/gulp/util.js index 12d21b0e16be2..bd87684a1dc3d 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -17,8 +17,11 @@ const fs = require('fs'); const path = require(`path`); -const pump = require(`pump`); +const pump = 
require(`stream`).pipeline; +const child_process = require(`child_process`); +const { targets, modules } = require('./argv'); const { Observable, ReplaySubject } = require('rxjs'); +const asyncDone = require('util').promisify(require('async-done')); const mainExport = `Arrow`; const npmPkgName = `apache-arrow`; @@ -29,7 +32,7 @@ const knownTargets = [`es5`, `es2015`, `esnext`]; const knownModules = [`cjs`, `esm`, `cls`, `umd`]; const tasksToSkipPerTargetOrFormat = { src: { clean: true, build: true }, - cls: { test: true, integration: true } + cls: { test: true, package: true } }; const packageJSONFields = [ `version`, `license`, `description`, @@ -66,7 +69,7 @@ const UMDSourceTargets = { es2015: `es2015`, es2016: `es2015`, es2017: `es2015`, - esnext: `es2015` + esnext: `esnext` }; const terserLanguageNames = { @@ -109,12 +112,27 @@ function targetDir(target, format) { return path.join(releasesRootDir, ...(!format ? [target] : [target, format])); } -function logAndDie(e) { - if (e) { - process.exit(1); - } +function shouldRunInChildProcess(target, format) { + // If we're building more than one module/target, then yes run this task in a child process + if (targets.length > 1 || modules.length > 1) { return true; } + // If the target we're building *isn't* the target the gulp command was configured to run, then yes run that in a child process + if (targets[0] !== target || modules[0] !== format) { return true; } + // Otherwise no need -- either gulp was run for just one target, or we've been spawned as the child of a multi-target parent gulp + return false; +} + +const gulp = path.join(path.parse(require.resolve(`gulp`)).dir, `bin/gulp.js`); +function spawnGulpCommandInChildProcess(command, target, format) { + const args = [gulp, command, '-t', target, '-m', format, `--silent`]; + const opts = { + stdio: [`ignore`, `inherit`, `inherit`], + env: { ...process.env, NODE_NO_WARNINGS: `1` } + }; + return asyncDone(() => child_process.spawn(`node`, args, opts)) + .catch((e) => { throw { message: `${command}:${taskName(target, format)}` }; }); } +const logAndDie = (e) => { if (e) { process.exit(1); } }; function observableFromStreams(...streams) { if (streams.length <= 0) { return Observable.empty(); } const pumped = streams.length <= 1 ? streams[0] : pump(...streams, logAndDie); @@ -164,12 +182,37 @@ function* combinations(_targets, _modules) { } } +const publicModulePaths = (dir) => [ + `${dir}/${mainExport}.dom.js`, + `${dir}/util/int.js`, + `${dir}/compute/predicate.js`, +]; + +const esmRequire = require(`esm`)(module, { + mode: `auto`, + cjs: { + /* A boolean for storing ES modules in require.cache. */ + cache: true, + /* A boolean for respecting require.extensions in ESM. */ + extensions: true, + /* A boolean for __esModule interoperability. */ + interop: true, + /* A boolean for importing named exports of CJS modules. */ + namedExports: true, + /* A boolean for following CJS path rules in ESM. */ + paths: true, + /* A boolean for __dirname, __filename, and require in ESM. 
*/ + vars: true, + } +}); + module.exports = { mainExport, npmPkgName, npmOrgName, metadataFiles, packageJSONFields, knownTargets, knownModules, tasksToSkipPerTargetOrFormat, - ESKeywords, gCCLanguageNames, UMDSourceTargets, terserLanguageNames, + gCCLanguageNames, UMDSourceTargets, terserLanguageNames, taskName, packageName, tsconfigName, targetDir, combinations, observableFromStreams, + ESKeywords, publicModulePaths, esmRequire, shouldRunInChildProcess, spawnGulpCommandInChildProcess }; diff --git a/js/gulpfile.js b/js/gulpfile.js index 78aaa17ddb8b4..37c1d187995d2 100644 --- a/js/gulpfile.js +++ b/js/gulpfile.js @@ -17,17 +17,15 @@ const del = require('del'); const gulp = require('gulp'); -const path = require('path'); const { Observable } = require('rxjs'); -const buildTask = require('./gulp/build-task'); const cleanTask = require('./gulp/clean-task'); +const compileTask = require('./gulp/compile-task'); const packageTask = require('./gulp/package-task'); const { targets, modules } = require('./gulp/argv'); const { testTask, createTestData, cleanTestData } = require('./gulp/test-task'); const { - targetDir, taskName, combinations, - knownTargets, + targetDir, knownTargets, npmPkgName, UMDSourceTargets, tasksToSkipPerTargetOrFormat } = require('./gulp/util'); @@ -36,63 +34,60 @@ for (const [target, format] of combinations([`all`], [`all`])) { const task = taskName(target, format); gulp.task(`clean:${task}`, cleanTask(target, format)); gulp.task( `test:${task}`, testTask(target, format)); - gulp.task(`debug:${task}`, testTask(target, format, true)); - gulp.task(`build:${task}`, gulp.series(`clean:${task}`, - buildTask(target, format), - packageTask(target, format))); + gulp.task(`compile:${task}`, compileTask(target, format)); + gulp.task(`package:${task}`, packageTask(target, format)); + gulp.task(`build:${task}`, gulp.series( + `clean:${task}`, `compile:${task}`, `package:${task}` + )); } // The UMD bundles build temporary es5/6/next targets via TS, // then run the TS source through either closure-compiler or // a minifier, so we special case that here. 
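// As a concrete illustration of that special case (a sketch only -- assuming
// a target `es5` whose UMD source target in UMDSourceTargets is also `es5`),
// the series registered below for each known target works out to roughly:
//
//   gulp.task(`build:es5:umd`, gulp.series(
//     `build:es5:cls`,                      // compile the temporary TS target
//     `clean:es5:umd`,                      // then clean, compile, and package
//     `compile:es5:umd`, `package:es5:umd`, // the UMD bundle itself
//     () => del(targetDir(`es5`, `cls`))    // finally remove the temp output
//   ));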
-knownTargets.forEach((target) => - gulp.task(`build:${target}:umd`, - gulp.series( - gulp.parallel( - cleanTask(target, `umd`), - cleanTask(UMDSourceTargets[target], `cls`) - ), - buildTask(UMDSourceTargets[target], `cls`), - buildTask(target, `umd`), packageTask(target, `umd`) - ) - ) -); +knownTargets.forEach((target) => { + const umd = taskName(target, `umd`); + const cls = taskName(UMDSourceTargets[target], `cls`); + gulp.task(`build:${umd}`, gulp.series( + `build:${cls}`, + `clean:${umd}`, `compile:${umd}`, `package:${umd}`, + function remove_closure_tmp_files() { + return del(targetDir(target, `cls`)) + } + )); +}); // The main "apache-arrow" module builds the es5/umd, es2015/cjs, // es2015/esm, and es2015/umd targets, then copies and renames the // compiled output into the apache-arrow folder gulp.task(`build:${npmPkgName}`, gulp.series( - cleanTask(npmPkgName), gulp.parallel( `build:${taskName(`es5`, `umd`)}`, `build:${taskName(`es2015`, `cjs`)}`, `build:${taskName(`es2015`, `esm`)}`, `build:${taskName(`es2015`, `umd`)}` ), - buildTask(npmPkgName), packageTask(npmPkgName) + `clean:${npmPkgName}`, + `compile:${npmPkgName}`, + `package:${npmPkgName}` ) ); - -function gulpConcurrent(tasks) { - return () => Observable.bindCallback((tasks, cb) => gulp.parallel(tasks)(cb))(tasks); -} - -const buildConcurrent = (tasks) => () => - gulpConcurrent(tasks)() - .concat(Observable - .defer(() => Observable - .merge(...knownTargets.map((target) => - del(`${targetDir(target, `cls`)}/**`))))); - +// And finally the global composite tasks gulp.task(`clean:testdata`, cleanTestData); gulp.task(`create:testdata`, createTestData); -gulp.task(`test`, gulp.series(getTasks(`test`))); -gulp.task(`debug`, gulp.series(getTasks(`debug`))); +gulp.task(`test`, gulpConcurrent(getTasks(`test`))); gulp.task(`clean`, gulp.parallel(getTasks(`clean`))); -gulp.task(`build`, buildConcurrent(getTasks(`build`))); -gulp.task(`default`, gulp.series(`build`, `test`)); +gulp.task(`build`, gulpConcurrent(getTasks(`build`))); +gulp.task(`compile`, gulpConcurrent(getTasks(`compile`))); +gulp.task(`package`, gulpConcurrent(getTasks(`package`))); +gulp.task(`default`, gulp.series(`clean`, `build`, `test`)); + +function gulpConcurrent(tasks) { + const numCPUs = Math.max(1, require('os').cpus().length * 0.75) | 0; + return () => Observable.from(tasks.map((task) => gulp.series(task))) + .flatMap((task) => Observable.bindNodeCallback(task)(), numCPUs); +} function getTasks(name) { const tasks = []; diff --git a/js/index.ts b/js/index.ts index 51b8676abbd9d..cfd64bbbe9730 100644 --- a/js/index.ts +++ b/js/index.ts @@ -15,4 +15,4 @@ // specific language governing permissions and limitations // under the License. -export * from './src/Arrow'; \ No newline at end of file +export * from './src/Arrow.node'; \ No newline at end of file diff --git a/js/jest.config.js b/js/jest.config.js new file mode 100644 index 0000000000000..55028d09f969e --- /dev/null +++ b/js/jest.config.js @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +module.exports = { + "verbose": false, + "reporters": [ + "jest-silent-reporter" + ], + "testEnvironment": "node", + "globals": { + "ts-jest": { + "diagnostics": false, + "tsConfig": "test/tsconfig.json" + } + }, + "roots": [ + "/test/" + ], + "moduleFileExtensions": [ + "js", + "ts", + "tsx" + ], + "coverageReporters": [ + "lcov" + ], + "coveragePathIgnorePatterns": [ + "fb\\/(File|Message|Schema|Tensor)\\.(js|ts)$", + "test\\/.*\\.(ts|tsx|js)$", + "/node_modules/" + ], + "transform": { + "^.+\\.jsx?$": "ts-jest", + "^.+\\.tsx?$": "ts-jest" + }, + "transformIgnorePatterns": [ + "/node_modules/(?!web-stream-tools).+\\.js$" + ], + "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$", + "preset": "ts-jest", + "testMatch": null +}; diff --git a/js/jest.coverage.config.js b/js/jest.coverage.config.js new file mode 100644 index 0000000000000..72ddd3c9345a0 --- /dev/null +++ b/js/jest.coverage.config.js @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
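// The module.exports below layers coverage-specific overrides on top of the
// base config via object spread; the effective result is roughly (a sketch --
// fields not listed here are inherited from ./jest.config.js unchanged):
//
//   {
//     ...require('./jest.config'),
//     reporters: undefined,                // fall back to jest's default reporter
//     coverageReporters: ['lcov', 'json'], // emit json coverage alongside lcov
//     globals: {
//       'ts-jest': { diagnostics: false, tsConfig: 'test/tsconfig.coverage.json' }
//     }
//   }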
+ +module.exports = { + ...require('./jest.config'), + "reporters": undefined, + "coverageReporters": [ + "lcov", "json" + ], + "globals": { + "ts-jest": { + "diagnostics": false, + "tsConfig": "test/tsconfig.coverage.json" + } + } +}; diff --git a/js/npm-release.sh b/js/npm-release.sh index 3ef24d3e6f828..a52e25ed7884a 100755 --- a/js/npm-release.sh +++ b/js/npm-release.sh @@ -20,11 +20,7 @@ set -e # validate the targets pass all tests before publishing npm install -# npx run-s clean:all lint create:testdata build -# npm run test -- -t ts -u --integration -# npm run test -- --integration -npx run-s clean:all lint build -npm run test +npx gulp # publish the JS target modules to npm npx lerna exec -- npm publish diff --git a/js/package-lock.json b/js/package-lock.json index ef38db9a7468d..1dc65df427e9b 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -64,57 +64,56 @@ } }, "@lerna/add": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/add/-/add-3.5.0.tgz", - "integrity": "sha512-hoOqtal/ChEEtt9rxR/6xmyvTN7581XF4kWHoWPV9NbfZN9e8uTR8z4mCcJq2DiZhRuY7aA5FEROEbl12soowQ==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/add/-/add-3.10.6.tgz", + "integrity": "sha512-FxQ5Bmyb5fF+3BQiNffM6cTeGCrl4uaAuGvxFIWF6Pgz6U14tUc1e16xgKDvVb1CurzJgIV5sLOT5xmCOqv1kA==", "dev": true, "requires": { - "@lerna/bootstrap": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/bootstrap": "3.10.6", + "@lerna/command": "3.10.6", + "@lerna/filter-options": "3.10.6", + "@lerna/npm-conf": "3.7.0", + "@lerna/validation-error": "3.6.0", "dedent": "^0.7.0", - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", "p-map": "^1.2.0", - "pacote": "^9.1.0", "semver": "^5.5.0" } }, "@lerna/batch-packages": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/@lerna/batch-packages/-/batch-packages-3.1.2.tgz", - "integrity": "sha512-HAkpptrYeUVlBYbLScXgeCgk6BsNVXxDd53HVWgzzTWpXV4MHpbpeKrByyt7viXlNhW0w73jJbipb/QlFsHIhQ==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/batch-packages/-/batch-packages-3.10.6.tgz", + "integrity": "sha512-sInr3ZQJFMh9Zq+ZUoVjX8R67j9ViRkVy0uEMsOfG+jZlXj1lRPRMPRiRgU0jXSYEwCdwuAB5pTd9tTx0VCJUw==", "dev": true, "requires": { - "@lerna/package-graph": "^3.1.2", - "@lerna/validation-error": "^3.0.0", - "npmlog": "^4.1.2" + "@lerna/package-graph": "3.10.6", + "@lerna/validation-error": "3.6.0", + "libnpm": "^2.0.1" } }, "@lerna/bootstrap": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/bootstrap/-/bootstrap-3.5.0.tgz", - "integrity": "sha512-+z4kVVJFO5EGfC2ob/4C9LetqWwDtbhZgTRllr1+zOi/2clbD+WKcVI0ku+/ckzKjz783SOc83swX7RrmiLwMQ==", - "dev": true, - "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/has-npm-version": "^3.3.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/npm-install": "^3.3.0", - "@lerna/rimraf-dir": "^3.3.0", - "@lerna/run-lifecycle": "^3.4.1", - "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/symlink-binary": "^3.3.0", - "@lerna/symlink-dependencies": "^3.3.0", - "@lerna/validation-error": "^3.0.0", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/bootstrap/-/bootstrap-3.10.6.tgz", + "integrity": "sha512-qbGjAxRpV/eiI9CboUIpsPPGpSogs8mN2/iDaAUBTaWVFVz/YyU64nui84Gll0kbdaHOyPput+kk2S8NCSCCdg==", + "dev": true, + "requires": { + "@lerna/batch-packages": "3.10.6", + 
"@lerna/command": "3.10.6", + "@lerna/filter-options": "3.10.6", + "@lerna/has-npm-version": "3.10.0", + "@lerna/npm-install": "3.10.0", + "@lerna/package-graph": "3.10.6", + "@lerna/pulse-till-done": "3.7.1", + "@lerna/rimraf-dir": "3.10.0", + "@lerna/run-lifecycle": "3.10.5", + "@lerna/run-parallel-batches": "3.0.0", + "@lerna/symlink-binary": "3.10.0", + "@lerna/symlink-dependencies": "3.10.0", + "@lerna/validation-error": "3.6.0", "dedent": "^0.7.0", "get-port": "^3.2.0", + "libnpm": "^2.0.1", "multimatch": "^2.1.0", - "npm-package-arg": "^6.0.0", - "npmlog": "^4.1.2", "p-finally": "^1.0.0", "p-map": "^1.2.0", "p-map-series": "^1.0.0", @@ -124,26 +123,26 @@ } }, "@lerna/changed": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/changed/-/changed-3.5.0.tgz", - "integrity": "sha512-p9o7/hXwFAoet7UPeHIzIPonYxLHZe9bcNcjxKztZYAne5/OgmZiF4X1UPL2S12wtkT77WQy4Oz8NjRTczcapg==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/changed/-/changed-3.10.6.tgz", + "integrity": "sha512-nZDVq/sKdhgoAg1BVnpqjqUUz5+zedG+AnU+6mjEN2f23YVtRCsW55N4I9eEdW2pxXUaCY85Hj/HPSA74BYaFg==", "dev": true, "requires": { - "@lerna/collect-updates": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/listable": "^3.0.0", - "@lerna/output": "^3.0.0", - "@lerna/version": "^3.5.0" + "@lerna/collect-updates": "3.10.1", + "@lerna/command": "3.10.6", + "@lerna/listable": "3.10.6", + "@lerna/output": "3.6.0", + "@lerna/version": "3.10.6" } }, "@lerna/check-working-tree": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/check-working-tree/-/check-working-tree-3.5.0.tgz", - "integrity": "sha512-aWeIputHddeZgf7/wA1e5yuv6q9S5si2y7fzO2Ah7m3KyDyl8XHP1M0VSSDzZeiloYCryAYQAoRgcrdH65Vhow==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/check-working-tree/-/check-working-tree-3.10.0.tgz", + "integrity": "sha512-NdIPhDgEtGHfeGjB9F0oAoPLywgMpjnJhLLwTNQkelDHo2xNAVpG8kV+A2UJ+cU5UXCZA4RZFxKNmw86rO+Drw==", "dev": true, "requires": { - "@lerna/describe-ref": "^3.5.0", - "@lerna/validation-error": "^3.0.0" + "@lerna/describe-ref": "3.10.0", + "@lerna/validation-error": "3.6.0" } }, "@lerna/child-process": { @@ -193,33 +192,44 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, "@lerna/clean": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/clean/-/clean-3.5.0.tgz", - "integrity": "sha512-bHUFF6Wv7ms81Tmwe56xk296oqU74Sg9NSkUCDG4kZLpYZx347Aw+89ZPTlaSmUwqCgEXKYLr65ZVVvKmflpcA==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/clean/-/clean-3.10.6.tgz", + "integrity": "sha512-MuL8HOwnyvVtr6GOiAN/Ofjbx+BJdCrtjrM1Uuh8FFnbnZTPVf+0MPxL2jVzPMo0PmoIrX3fvlwvzKNk/lH0Ug==", "dev": true, "requires": { - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/prompt": "^3.3.1", - "@lerna/rimraf-dir": "^3.3.0", + "@lerna/command": "3.10.6", + "@lerna/filter-options": "3.10.6", + "@lerna/prompt": "3.6.0", + "@lerna/pulse-till-done": "3.7.1", + "@lerna/rimraf-dir": "3.10.0", "p-map": "^1.2.0", "p-map-series": "^1.0.0", "p-waterfall": "^1.0.0" } }, "@lerna/cli": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/@lerna/cli/-/cli-3.2.0.tgz", - "integrity": 
"sha512-JdbLyTxHqxUlrkI+Ke+ltXbtyA+MPu9zR6kg/n8Fl6uaez/2fZWtReXzYi8MgLxfUFa7+1OHWJv4eAMZlByJ+Q==", + "version": "3.10.7", + "resolved": "https://registry.npmjs.org/@lerna/cli/-/cli-3.10.7.tgz", + "integrity": "sha512-yuoz/24mIfYit3neKqoE5NVs42Rj9A6A6SlkNPDfsy3v/Vh7SgYkU3cwiGyvwBGzIdhqL4/SWYo8H7YJLs0C+g==", "dev": true, "requires": { - "@lerna/global-options": "^3.1.3", + "@lerna/global-options": "3.10.6", "dedent": "^0.7.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "yargs": "^12.0.1" }, "dependencies": { @@ -260,13 +270,13 @@ } }, "execa": { - "version": "0.10.0", - "resolved": "https://registry.npmjs.org/execa/-/execa-0.10.0.tgz", - "integrity": "sha512-7XOMnz8Ynx1gGo/3hyV9loYNPWM94jG3+3T3Y8tsfSstFmETmENCMU/A/zj8Lyaj1lkgEepKepvd6240tBRvlw==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz", + "integrity": "sha512-adbxcyWV46qiHyvSp50TKt05tB4tK3HcmF7/nxfAdhnox83seTDbwnaqKO4sXRy7roHAIFqJP/Rw/AuEbX61LA==", "dev": true, "requires": { "cross-spawn": "^6.0.0", - "get-stream": "^3.0.0", + "get-stream": "^4.0.0", "is-stream": "^1.1.0", "npm-run-path": "^2.0.0", "p-finally": "^1.0.0", @@ -283,6 +293,15 @@ "locate-path": "^3.0.0" } }, + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", + "dev": true, + "requires": { + "pump": "^3.0.0" + } + }, "invert-kv": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/invert-kv/-/invert-kv-2.0.0.tgz", @@ -326,20 +345,20 @@ } }, "os-locale": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.0.1.tgz", - "integrity": "sha512-7g5e7dmXPtzcP4bgsZ8ixDVqA7oWYuEz4lOSujeWyliPai4gfVDiFIcwBg3aGCPnmSGfzOKTK3ccPn0CKv3DBw==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", + "integrity": "sha512-Z8l3R4wYWM40/52Z+S265okfFj8Kt2cC2MKY+xNi3kFs+XGI7WXu/I309QQQYbRW4ijiZ+yxs9pqEhJh0DqW3Q==", "dev": true, "requires": { - "execa": "^0.10.0", + "execa": "^1.0.0", "lcid": "^2.0.0", "mem": "^4.0.0" } }, "p-limit": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.0.0.tgz", - "integrity": "sha512-fl5s52lI5ahKCernzzIyAP0QAZbGIovtVHGwpcu1Jr/EpzLVDI2myISHwGqK7m8uQFugVWSrbxH7XnhGtvEc+A==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.1.0.tgz", + "integrity": "sha512-NhURkNcrVB+8hNfLuysU8enY5xn2KXphsHBaC2YmRNTZRc7RWusw6apSpdEj3jo4CMb6W9nrF6tTnsJsJeyu6g==", "dev": true, "requires": { "p-try": "^2.0.0" @@ -360,6 +379,22 @@ "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", "dev": true }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", @@ -379,6 +414,12 @@ "ansi-regex": "^3.0.0" } }, + "which-module": { + "version": "2.0.0", + "resolved": 
"https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, "yargs": { "version": "12.0.5", "resolved": "https://registry.npmjs.org/yargs/-/yargs-12.0.5.tgz", @@ -412,34 +453,34 @@ } }, "@lerna/collect-updates": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/collect-updates/-/collect-updates-3.5.0.tgz", - "integrity": "sha512-rFCng14K8vHyrDJSAacj6ABKKT/TxZdpL9uPEtZN7DsoJKlKPzqFeRvRGA2+ed/I6mEm4ltauEjEpKG5O6xqtw==", + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/@lerna/collect-updates/-/collect-updates-3.10.1.tgz", + "integrity": "sha512-vb0wEJ8k63G+2CR/ud1WeVHNJ21Fs6Ew6lbdGZXnF4ZvaFWxWJZpoHeWwzjhMdJ75QdTzUaIhTG1hnH9faQNMw==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/describe-ref": "^3.5.0", + "@lerna/child-process": "3.3.0", + "@lerna/describe-ref": "3.10.0", + "libnpm": "^2.0.1", "minimatch": "^3.0.4", - "npmlog": "^4.1.2", "slash": "^1.0.0" } }, "@lerna/command": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/command/-/command-3.5.0.tgz", - "integrity": "sha512-C/0e7qPbuKZ9vEqzRePksoKDJk4TOWzsU5qaPP/ikqc6vClJbKucsIehk3za6glSjlgLCJpzBTF2lFjHfb+JNw==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/command/-/command-3.10.6.tgz", + "integrity": "sha512-jPZswMZXOpAaIuSF5hrz+eaWQzbDrvwbrkCoRJKfiAHx7URAkE6MQe9DeAnqrTKMqwfg0RciSrZLc8kWYfrzCQ==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/package-graph": "^3.1.2", - "@lerna/project": "^3.5.0", - "@lerna/validation-error": "^3.0.0", - "@lerna/write-log-file": "^3.0.0", + "@lerna/child-process": "3.3.0", + "@lerna/package-graph": "3.10.6", + "@lerna/project": "3.10.0", + "@lerna/validation-error": "3.6.0", + "@lerna/write-log-file": "3.6.0", "dedent": "^0.7.0", "execa": "^1.0.0", "is-ci": "^1.0.10", - "lodash": "^4.17.5", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1", + "lodash": "^4.17.5" }, "dependencies": { "cross-spawn": { @@ -478,23 +519,32 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, "@lerna/conventional-commits": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/conventional-commits/-/conventional-commits-3.5.0.tgz", - "integrity": "sha512-roKPILPYnDWiCDxOeBQ0cObJ2FbDgzJSToxr1ZwIqvJU5hGQ4RmooCf8GHcCW9maBJz7ETeestv8M2mBUgBPbg==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/conventional-commits/-/conventional-commits-3.10.0.tgz", + "integrity": "sha512-8FvO0eR8g/tEgkb6eRVYaD39TsqMKsOXp17EV48jciciEqcrF/d1Ypu6ilK1GDp6R/1m2mbjt/b52a/qrO+xaw==", "dev": true, "requires": { - "@lerna/validation-error": "^3.0.0", + "@lerna/validation-error": "3.6.0", "conventional-changelog-angular": "^5.0.2", "conventional-changelog-core": "^3.1.5", "conventional-recommended-bump": "^4.0.4", "fs-extra": "^7.0.0", "get-stream": "^4.0.0", - "npm-package-arg": "^6.0.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "semver": "^5.5.0" }, "dependencies": { @@ -506,25 +556,36 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": 
"sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, "@lerna/create": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/create/-/create-3.5.0.tgz", - "integrity": "sha512-ek4flHRmpMegZp9tP3RmuDhmMb9+/Hhy9B5eaZc5X5KWqDvFKJtn56sw+M9hNjiYehiimCwhaLWgE2WSikPvcQ==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/create/-/create-3.10.6.tgz", + "integrity": "sha512-OddQtGBHM2/eJONggLWoTE6275XGbnJ6dIVF+fLsKS93o4GC6g+qcc6Y7lUWHm5bfpeOwNOVKwj0tvqBZ6MgoA==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/child-process": "3.3.0", + "@lerna/command": "3.10.6", + "@lerna/npm-conf": "3.7.0", + "@lerna/validation-error": "3.6.0", "camelcase": "^4.1.0", "dedent": "^0.7.0", "fs-extra": "^7.0.0", "globby": "^8.0.1", "init-package-json": "^1.10.3", - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", + "p-reduce": "^1.0.0", "pify": "^3.0.0", "semver": "^5.5.0", "slash": "^1.0.0", @@ -540,13 +601,13 @@ "dev": true }, "globby": { - "version": "8.0.1", - "resolved": "https://registry.npmjs.org/globby/-/globby-8.0.1.tgz", - "integrity": "sha512-oMrYrJERnKBLXNLVTqhm3vPEdJ/b2ZE28xN4YARiix1NOIOBPEpOUnm844K1iu/BkphCaf2WNFwMszv8Soi1pw==", + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/globby/-/globby-8.0.2.tgz", + "integrity": "sha512-yTzMmKygLp8RUpG1Ymu2VXPSJQZjNAZPD4ywgYEaG7e4tBJeUQBO8OpXrf1RCNcEs5alsoJYPAMiIHP0cmeC7w==", "dev": true, "requires": { "array-union": "^1.0.1", - "dir-glob": "^2.0.0", + "dir-glob": "2.0.0", "fast-glob": "^2.0.2", "glob": "^7.1.2", "ignore": "^3.3.5", @@ -568,178 +629,214 @@ } }, "@lerna/create-symlink": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/create-symlink/-/create-symlink-3.3.0.tgz", - "integrity": "sha512-0lb88Nnq1c/GG+fwybuReOnw3+ah4dB81PuWwWwuqUNPE0n50qUf/M/7FfSb5JEh/93fcdbZI0La8t3iysNW1w==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/create-symlink/-/create-symlink-3.6.0.tgz", + "integrity": "sha512-YG3lTb6zylvmGqKU+QYA3ylSnoLn+FyLH5XZmUsD0i85R884+EyJJeHx/zUk+yrL2ZwHS4RBUgJfC24fqzgPoA==", "dev": true, "requires": { "cmd-shim": "^2.0.2", "fs-extra": "^7.0.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/describe-ref": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/describe-ref/-/describe-ref-3.5.0.tgz", - "integrity": "sha512-XvecK2PSwUv4z+otib5moWJMI+h3mtAg8nFlfo4KbivVtD/sI11jfKsr3S75HuAwhVAa8tAijoAxmuBJSsTE1g==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/describe-ref/-/describe-ref-3.10.0.tgz", + "integrity": "sha512-fouh3FQS07QxJJp/mW8LkGnH0xMRAzpBlejtZaiRwfDkW2kd6EuHaj8I/2/p21Wsprcvuu4dqmyia2YS1xFb/w==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "npmlog": "^4.1.2" + "@lerna/child-process": "3.3.0", + "libnpm": "^2.0.1" } }, "@lerna/diff": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/diff/-/diff-3.5.0.tgz", - "integrity": "sha512-iyZ0ZRPqH5Y5XEhOYoKS8H/8UXC/gZ/idlToMFHhUn1oTSd8v9HVU1c2xq1ge0u36ZH/fx/YydUk0A/KSv+p3Q==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/diff/-/diff-3.10.6.tgz", + "integrity": "sha512-0MqFhosjrqsIdXiKIu7t3CiJELqiU9mkjFBhYPB7JruAzpPwjMXJnC6/Ur5/7LXJYYVpqGQwZI9ZaZlOYJhhrw==", "dev": true, "requires": { - "@lerna/child-process": 
"^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/validation-error": "^3.0.0", - "npmlog": "^4.1.2" + "@lerna/child-process": "3.3.0", + "@lerna/command": "3.10.6", + "@lerna/validation-error": "3.6.0", + "libnpm": "^2.0.1" } }, "@lerna/exec": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/exec/-/exec-3.5.0.tgz", - "integrity": "sha512-H5jeIueDiuNsxeuGKaP7HqTcenvMsFfBFeWr0W6knHv9NrOF8il34dBqYgApZEDSQ7+2fA3ghwWbF+jUGTSh/A==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/exec/-/exec-3.10.6.tgz", + "integrity": "sha512-cdHqaRBMYceJu8rZLO8b4ZeR27O+xKPHgzi13OOOfBJQjrTuacjMWyHgmpy8jWc/0f7QnTl4VsHks7VJ3UK+vw==", "dev": true, "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/validation-error": "^3.0.0" + "@lerna/batch-packages": "3.10.6", + "@lerna/child-process": "3.3.0", + "@lerna/command": "3.10.6", + "@lerna/filter-options": "3.10.6", + "@lerna/run-parallel-batches": "3.0.0", + "@lerna/validation-error": "3.6.0" } }, "@lerna/filter-options": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/filter-options/-/filter-options-3.5.0.tgz", - "integrity": "sha512-7pEQy1i5ynYOYjcSeo+Qaps4+Ais55RRdnT6/SLLBgyyHAMziflFLX5TnoyEaaXoU90iKfQ5z/ioEp6dFAXSMg==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/filter-options/-/filter-options-3.10.6.tgz", + "integrity": "sha512-r/dQbqN+RGFKZNn+DyWehswFmAkny/fkdMB2sRM2YVe7zRTtSl95YxD9DtdYnpJTG/jbOVICS/L5QJakrI6SSw==", "dev": true, "requires": { - "@lerna/collect-updates": "^3.5.0", - "@lerna/filter-packages": "^3.0.0", + "@lerna/collect-updates": "3.10.1", + "@lerna/filter-packages": "3.10.0", "dedent": "^0.7.0" } }, "@lerna/filter-packages": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/filter-packages/-/filter-packages-3.0.0.tgz", - "integrity": "sha512-zwbY1J4uRjWRZ/FgYbtVkq7I3Nduwsg2V2HwLKSzwV2vPglfGqgovYOVkND6/xqe2BHwDX4IyA2+e7OJmLaLSA==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/filter-packages/-/filter-packages-3.10.0.tgz", + "integrity": "sha512-3Acdj+jbany6LnQSuImU4ttcK5ULHSVug8Gh/EvwTewKCDpHAuoI3eyuzZOnSBdMvDOjE03uIESQK0dNNsn6Ow==", "dev": true, "requires": { - "@lerna/validation-error": "^3.0.0", - "multimatch": "^2.1.0", - "npmlog": "^4.1.2" + "@lerna/validation-error": "3.6.0", + "libnpm": "^2.0.1", + "multimatch": "^2.1.0" } }, "@lerna/get-npm-exec-opts": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/get-npm-exec-opts/-/get-npm-exec-opts-3.0.0.tgz", - "integrity": "sha512-arcYUm+4xS8J3Palhl+5rRJXnZnFHsLFKHBxznkPIxjwGQeAEw7df38uHdVjEQ+HNeFmHnBgSqfbxl1VIw5DHg==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/get-npm-exec-opts/-/get-npm-exec-opts-3.6.0.tgz", + "integrity": "sha512-ruH6KuLlt75aCObXfUIdVJqmfVq7sgWGq5mXa05vc1MEqxTIiU23YiJdWzofQOOUOACaZkzZ4K4Nu7wXEg4Xgg==", + "dev": true, + "requires": { + "libnpm": "^2.0.1" + } + }, + "@lerna/get-packed": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/@lerna/get-packed/-/get-packed-3.7.0.tgz", + "integrity": "sha512-yuFtjsUZIHjeIvIYQ/QuytC+FQcHwo3peB+yGBST2uWCLUCR5rx6knoQcPzbxdFDCuUb5IFccFGd3B1fHFg3RQ==", "dev": true, "requires": { - "npmlog": "^4.1.2" + "fs-extra": "^7.0.0", + "ssri": "^6.0.1", + "tar": "^4.4.8" + }, + "dependencies": { + "tar": { + "version": "4.4.8", + "resolved": 
"https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", + "integrity": "sha512-LzHF64s5chPQQS0IYBn9IN5h3i98c12bo4NCO7e0sGM2llXQ3p2FGC5sdENN4cTW48O915Sh+x+EXx7XW96xYQ==", + "dev": true, + "requires": { + "chownr": "^1.1.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.3.4", + "minizlib": "^1.1.1", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.2", + "yallist": "^3.0.2" + } + }, + "yallist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", + "dev": true + } } }, "@lerna/global-options": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/@lerna/global-options/-/global-options-3.1.3.tgz", - "integrity": "sha512-LVeZU/Zgc0XkHdGMRYn+EmHfDmmYNwYRv3ta59iCVFXLVp7FRFWF7oB1ss/WRa9x/pYU0o6L8as/5DomLUGASA==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/global-options/-/global-options-3.10.6.tgz", + "integrity": "sha512-k5Xkq1M/uREFC2R9uwN5gcvIgjj4iOXo0YyeEXCMWBiW3j2GL9xN4d1MmAIcrYlAzVYh6kLlWaFWl/rNIneHIw==", "dev": true }, "@lerna/has-npm-version": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/has-npm-version/-/has-npm-version-3.3.0.tgz", - "integrity": "sha512-GX7omRep1eBRZHgjZLRw3MpBJSdA5gPZFz95P7rxhpvsiG384Tdrr/cKFMhm0A09yq27Tk/nuYTaZIj7HsVE6g==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/has-npm-version/-/has-npm-version-3.10.0.tgz", + "integrity": "sha512-N4RRYxGeivuaKgPDzrhkQOQs1Sg4tOnxnEe3akfqu1wDA4Ng5V6Y2uW3DbkAjFL3aNJhWF5Vbf7sBsGtfgDQ8w==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", + "@lerna/child-process": "3.3.0", "semver": "^5.5.0" } }, "@lerna/import": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/import/-/import-3.5.0.tgz", - "integrity": "sha512-vgI6lMEzd1ODgi75cmAlfPYylaK37WY3E2fwKyO/lj6UKSGj46dVSK0KwTRHx33tu4PLvPzFi5C6nbY57o5ykQ==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/import/-/import-3.10.6.tgz", + "integrity": "sha512-LlGxhfDhovoNoBJLF3PYd3j/G2GFTnfLh0V38+hBQ6lomMNJbjkACfiLVomQxPWWpYLk0GTlpWYR8YGv6L7Ifw==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/prompt": "^3.3.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/child-process": "3.3.0", + "@lerna/command": "3.10.6", + "@lerna/prompt": "3.6.0", + "@lerna/pulse-till-done": "3.7.1", + "@lerna/validation-error": "3.6.0", "dedent": "^0.7.0", "fs-extra": "^7.0.0", "p-map-series": "^1.0.0" } }, "@lerna/init": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/init/-/init-3.5.0.tgz", - "integrity": "sha512-V21/UWj34Mph+9NxIGH1kYcuJAp+uFjfG8Ku2nMy62OGL3553+YQ+Izr+R6egY8y/99UMCDpi5gkQni5eGv3MA==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/init/-/init-3.10.6.tgz", + "integrity": "sha512-RIlEx+ofWLYRNjxCkkV3G0XQPM+/KA5RXRDb5wKQLYO1f+tZAaHoUh8fHDIvxGf/ohY/OIjYYGSsU+ysimfwiQ==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", + "@lerna/child-process": "3.3.0", + "@lerna/command": "3.10.6", "fs-extra": "^7.0.0", "p-map": "^1.2.0", "write-json-file": "^2.3.0" } }, "@lerna/link": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/link/-/link-3.5.0.tgz", - "integrity": "sha512-KSu1mhxwNRmguqMqUTJd4c7QIk9/xmxJxbmMkA71OaJd4fwondob6DyI/B17NIWutdLbvSWQ7pRlFOPxjQVoUw==", + "version": "3.10.6", + "resolved": 
"https://registry.npmjs.org/@lerna/link/-/link-3.10.6.tgz", + "integrity": "sha512-dwD6qftRWitgLDYbqtDrgO7c8uF5C0fHVew5M6gU5m9tBJidqd7cDwHv/bXboLEI63U7tt5y6LY+wEpYUFsBRw==", "dev": true, "requires": { - "@lerna/command": "^3.5.0", - "@lerna/package-graph": "^3.1.2", - "@lerna/symlink-dependencies": "^3.3.0", + "@lerna/command": "3.10.6", + "@lerna/package-graph": "3.10.6", + "@lerna/symlink-dependencies": "3.10.0", "p-map": "^1.2.0", "slash": "^1.0.0" } }, "@lerna/list": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/list/-/list-3.5.0.tgz", - "integrity": "sha512-T+NZBQ/l6FmZklgrtFuN7luMs3AC/BoS52APOPrM7ZmxW4nenvov0xMwQW1783w/t365YDkDlYd5gM0nX3D1Hg==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/list/-/list-3.10.6.tgz", + "integrity": "sha512-3ElQBj2dOB4uUkpsjC1bxdeZwEzRBuV1pBBs5E1LncwsZf7D9D99Z32fuZsDaCHpEMgHAD4/j8juI3/7m5dkaQ==", "dev": true, "requires": { - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/listable": "^3.0.0", - "@lerna/output": "^3.0.0" + "@lerna/command": "3.10.6", + "@lerna/filter-options": "3.10.6", + "@lerna/listable": "3.10.6", + "@lerna/output": "3.6.0" } }, "@lerna/listable": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/listable/-/listable-3.0.0.tgz", - "integrity": "sha512-HX/9hyx1HLg2kpiKXIUc1EimlkK1T58aKQ7ovO7rQdTx9ForpefoMzyLnHE1n4XrUtEszcSWJIICJ/F898M6Ag==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/listable/-/listable-3.10.6.tgz", + "integrity": "sha512-F7ZuvesSgeuMiJf99eOum5p1MQGQStykcmHH1ek+LQRMiGGF1o3PkBxPvHTZBADGOFarek8bFA5TVmRAMX7NIw==", "dev": true, "requires": { + "@lerna/batch-packages": "3.10.6", "chalk": "^2.3.1", "columnify": "^1.5.4" } }, "@lerna/log-packed": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@lerna/log-packed/-/log-packed-3.0.4.tgz", - "integrity": "sha512-vVQHgMagE2wnbxhNY9nFkdu+Cx2TsyWalkJfkxbNzmo6gOCrDsxCBDj9vTEV8Q+4aWx0C0Bsc0sB2Eb8y/+ofA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/log-packed/-/log-packed-3.6.0.tgz", + "integrity": "sha512-T/J41zMkzpWB5nbiTRS5PmYTFn74mJXe6RQA2qhkdLi0UqnTp97Pux1loz3jsJf2yJtiQUnyMM7KuKIAge0Vlw==", "dev": true, "requires": { "byte-size": "^4.0.3", "columnify": "^1.5.4", "has-unicode": "^2.0.1", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/npm-conf": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/npm-conf/-/npm-conf-3.4.1.tgz", - "integrity": "sha512-i9G6DnbCqiAqxKx2rSXej/n14qxlV/XOebL6QZonxJKzNTB+Q2wglnhTXmfZXTPJfoqimLaY4NfAEtbOXRWOXQ==", + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/@lerna/npm-conf/-/npm-conf-3.7.0.tgz", + "integrity": "sha512-+WSMDfPKcKzMfqq283ydz9RRpOU6p9wfx0wy4hVSUY/6YUpsyuk8SShjcRtY8zTM5AOrxvFBuuV90H4YpZ5+Ng==", "dev": true, "requires": { "config-chain": "^1.1.11", @@ -747,125 +844,180 @@ } }, "@lerna/npm-dist-tag": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/npm-dist-tag/-/npm-dist-tag-3.3.0.tgz", - "integrity": "sha512-EtZJXzh3w5tqXEev+EBBPrWKWWn0WgJfxm4FihfS9VgyaAW8udIVZHGkIQ3f+tBtupcAzA9Q8cQNUkGF2efwmA==", + "version": "3.8.5", + "resolved": "https://registry.npmjs.org/@lerna/npm-dist-tag/-/npm-dist-tag-3.8.5.tgz", + "integrity": "sha512-VO57yKTB4NC2LZuTd4w0LmlRpoFm/gejQ1gqqLGzSJuSZaBXmieElFovzl21S07cqiy7FNVdz75x7/a6WCZ6XA==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "npmlog": "^4.1.2" + "figgy-pudding": "^3.5.1", + "libnpm": "^2.0.1" } }, 
"@lerna/npm-install": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/npm-install/-/npm-install-3.3.0.tgz", - "integrity": "sha512-WoVvKdS8ltROTGSNQwo6NDq0YKnjwhvTG4li1okcN/eHKOS3tL9bxbgPx7No0wOq5DKBpdeS9KhAfee6LFAZ5g==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/npm-install/-/npm-install-3.10.0.tgz", + "integrity": "sha512-/6/XyLY9/4jaMPBOVYUr4wZxQURIfwoELY0qCQ8gZ5zv4cOiFiiCUxZ0i4fxqFtD7nJ084zq1DsZW0aH0CIWYw==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", + "@lerna/child-process": "3.3.0", + "@lerna/get-npm-exec-opts": "3.6.0", "fs-extra": "^7.0.0", - "npm-package-arg": "^6.0.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "signal-exit": "^3.0.2", "write-pkg": "^3.1.0" } }, "@lerna/npm-publish": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/@lerna/npm-publish/-/npm-publish-3.3.1.tgz", - "integrity": "sha512-bVTlWIcBL6Zpyzqvr9C7rxXYcoPw+l7IPz5eqQDNREj1R39Wj18OWB2KTJq8l7LIX7Wf4C2A1uT5hJaEf9BuvA==", + "version": "3.10.7", + "resolved": "https://registry.npmjs.org/@lerna/npm-publish/-/npm-publish-3.10.7.tgz", + "integrity": "sha512-oU3/Q+eHC1fRjh7bk6Nn4tRD1OLR6XZVs3v+UWMWMrF4hVSV61pxcP5tpeI1n4gDQjSgh7seI4EzKVJe/WfraA==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "@lerna/has-npm-version": "^3.3.0", - "@lerna/log-packed": "^3.0.4", + "@lerna/run-lifecycle": "3.10.5", + "figgy-pudding": "^3.5.1", "fs-extra": "^7.0.0", - "npmlog": "^4.1.2", - "p-map": "^1.2.0" + "libnpm": "^2.0.1" } }, "@lerna/npm-run-script": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/npm-run-script/-/npm-run-script-3.3.0.tgz", - "integrity": "sha512-YqDguWZzp4jIomaE4aWMUP7MIAJAFvRAf6ziQLpqwoQskfWLqK5mW0CcszT1oLjhfb3cY3MMfSTFaqwbdKmICg==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/npm-run-script/-/npm-run-script-3.10.0.tgz", + "integrity": "sha512-c21tBXLF1Wje4tx/Td9jKIMrlZo/8QQiyyadjdKpwyyo7orSMsVNXGyJwvZ4JVVDcwC3GPU6HQvkt63v7rcyaw==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "npmlog": "^4.1.2" + "@lerna/child-process": "3.3.0", + "@lerna/get-npm-exec-opts": "3.6.0", + "libnpm": "^2.0.1" } }, "@lerna/output": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/output/-/output-3.0.0.tgz", - "integrity": "sha512-EFxnSbO0zDEVKkTKpoCUAFcZjc3gn3DwPlyTDxbeqPU7neCfxP4rA4+0a6pcOfTlRS5kLBRMx79F2TRCaMM3DA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/output/-/output-3.6.0.tgz", + "integrity": "sha512-9sjQouf6p7VQtVCRnzoTGlZyURd48i3ha3WBHC/UBJnHZFuXMqWVPKNuvnMf2kRXDyoQD+2mNywpmEJg5jOnRg==", "dev": true, "requires": { - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" + } + }, + "@lerna/pack-directory": { + "version": "3.10.5", + "resolved": "https://registry.npmjs.org/@lerna/pack-directory/-/pack-directory-3.10.5.tgz", + "integrity": "sha512-Ulj24L9XdgjJIxBr6ZjRJEoBULVH3c10lqunUdW41bswXhzhirRtQIxv0+5shngNjDwgMmJfOBcuCVKPSez4tg==", + "dev": true, + "requires": { + "@lerna/get-packed": "3.7.0", + "@lerna/package": "3.7.2", + "@lerna/run-lifecycle": "3.10.5", + "figgy-pudding": "^3.5.1", + "libnpm": "^2.0.1", + "npm-packlist": "^1.1.12", + "tar": "^4.4.8", + "temp-write": "^3.4.0" + }, + "dependencies": { + "tar": { + "version": "4.4.8", + "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", + "integrity": 
"sha512-LzHF64s5chPQQS0IYBn9IN5h3i98c12bo4NCO7e0sGM2llXQ3p2FGC5sdENN4cTW48O915Sh+x+EXx7XW96xYQ==", + "dev": true, + "requires": { + "chownr": "^1.1.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.3.4", + "minizlib": "^1.1.1", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.2", + "yallist": "^3.0.2" + } + }, + "yallist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", + "dev": true + } } }, "@lerna/package": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/package/-/package-3.0.0.tgz", - "integrity": "sha512-djzEJxzn212wS8d9znBnlXkeRlPL7GqeAYBykAmsuq51YGvaQK67Umh5ejdO0uxexF/4r7yRwgrlRHpQs8Rfqg==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/package/-/package-3.7.2.tgz", + "integrity": "sha512-8A5hN2CekM1a0Ix4VUO/g+REo+MsnXb8lnQ0bGjr1YGWzSL5NxYJ0Z9+0pwTfDpvRDYlFYO0rMVwBUW44b4dUw==", "dev": true, "requires": { - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", + "load-json-file": "^4.0.0", "write-pkg": "^3.1.0" + }, + "dependencies": { + "load-json-file": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", + "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "parse-json": "^4.0.0", + "pify": "^3.0.0", + "strip-bom": "^3.0.0" + } + }, + "parse-json": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", + "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", + "dev": true, + "requires": { + "error-ex": "^1.3.1", + "json-parse-better-errors": "^1.0.1" + } + }, + "strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", + "dev": true + } } }, "@lerna/package-graph": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/@lerna/package-graph/-/package-graph-3.1.2.tgz", - "integrity": "sha512-9wIWb49I1IJmyjPdEVZQ13IAi9biGfH/OZHOC04U2zXGA0GLiY+B3CAx6FQvqkZ8xEGfqzmXnv3LvZ0bQfc1aQ==", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/package-graph/-/package-graph-3.10.6.tgz", + "integrity": "sha512-mpIOJbhi+xLqT9BcUrLVD4We8WUdousQf/QndbEWl8DWAW1ethtRHVsCm9ufdBB3F9nj4PH/hqnDWWwqE+rS4w==", "dev": true, "requires": { - "@lerna/validation-error": "^3.0.0", - "npm-package-arg": "^6.0.0", + "@lerna/validation-error": "3.6.0", + "libnpm": "^2.0.1", "semver": "^5.5.0" } }, "@lerna/project": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/project/-/project-3.5.0.tgz", - "integrity": "sha512-uFDzqwrD7a/tTohQoo0voTsRy2cgl9D1ZOU2pHZzHzow9S1M8E0x5q3hJI2HlwsZry9IUugmDUGO6UddTjwm3Q==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/project/-/project-3.10.0.tgz", + "integrity": "sha512-9QRl8aGHuyU4zVEELQmNPnJTlS7XHqX7w9I9isCXdnilKc2R0MyvUs21lj6Yyt6xTuQnqD158TR9tbS4QufYQQ==", "dev": true, "requires": { - "@lerna/package": "^3.0.0", - "@lerna/validation-error": "^3.0.0", + "@lerna/package": "3.7.2", + "@lerna/validation-error": "3.6.0", "cosmiconfig": "^5.0.2", "dedent": "^0.7.0", "dot-prop": "^4.2.0", "glob-parent": "^3.1.0", "globby": "^8.0.1", + "libnpm": "^2.0.1", "load-json-file": "^4.0.0", - "npmlog": "^4.1.2", "p-map": "^1.2.0", "resolve-from": "^4.0.0", "write-json-file": "^2.3.0" }, "dependencies": { - "glob-parent": { - "version": "3.1.0", - "resolved": 
"https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - } - }, "globby": { - "version": "8.0.1", - "resolved": "https://registry.npmjs.org/globby/-/globby-8.0.1.tgz", - "integrity": "sha512-oMrYrJERnKBLXNLVTqhm3vPEdJ/b2ZE28xN4YARiix1NOIOBPEpOUnm844K1iu/BkphCaf2WNFwMszv8Soi1pw==", + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/globby/-/globby-8.0.2.tgz", + "integrity": "sha512-yTzMmKygLp8RUpG1Ymu2VXPSJQZjNAZPD4ywgYEaG7e4tBJeUQBO8OpXrf1RCNcEs5alsoJYPAMiIHP0cmeC7w==", "dev": true, "requires": { "array-union": "^1.0.1", - "dir-glob": "^2.0.0", + "dir-glob": "2.0.0", "fast-glob": "^2.0.2", "glob": "^7.1.2", "ignore": "^3.3.5", @@ -873,21 +1025,6 @@ "slash": "^1.0.0" } }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - }, "load-json-file": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", @@ -925,42 +1062,42 @@ } }, "@lerna/prompt": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/@lerna/prompt/-/prompt-3.3.1.tgz", - "integrity": "sha512-eJhofrUCUaItMIH6et8kI7YqHfhjWqGZoTsE+40NRCfAraOMWx+pDzfRfeoAl3qeRAH2HhNj1bkYn70FbUOxuQ==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/prompt/-/prompt-3.6.0.tgz", + "integrity": "sha512-nyAjPMolJ/ZRAAVcXrUH89C4n1SiWvLh4xWNvWYKLcf3PI5yges35sDFP/HYrM4+cEbkNFuJCRq6CxaET4PRsg==", "dev": true, "requires": { "inquirer": "^6.2.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/publish": { - "version": "3.5.1", - "resolved": "https://registry.npmjs.org/@lerna/publish/-/publish-3.5.1.tgz", - "integrity": "sha512-ltw2YdWWzev9cZRAzons5ywZh9NJARPX67meeA95oMDVMrhD4Y9VHQNJ3T8ueec/W78/4sKlMSr3ecWyPNp5bg==", - "dev": true, - "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/check-working-tree": "^3.5.0", - "@lerna/child-process": "^3.3.0", - "@lerna/collect-updates": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/describe-ref": "^3.5.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/npm-dist-tag": "^3.3.0", - "@lerna/npm-publish": "^3.3.1", - "@lerna/output": "^3.0.0", - "@lerna/prompt": "^3.3.1", - "@lerna/run-lifecycle": "^3.4.1", - "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/validation-error": "^3.0.0", - "@lerna/version": "^3.5.0", + "version": "3.10.7", + "resolved": "https://registry.npmjs.org/@lerna/publish/-/publish-3.10.7.tgz", + "integrity": "sha512-Qd8pml2l9s6GIvNX1pTnia+Ddjsm9LF3pRRoOQeugAdv2IJNf45c/83AAEyE9M2ShG5VjgxEITNW4Lg49zipjQ==", + "dev": true, + "requires": { + "@lerna/batch-packages": "3.10.6", + "@lerna/check-working-tree": "3.10.0", + "@lerna/child-process": "3.3.0", + "@lerna/collect-updates": "3.10.1", + "@lerna/command": "3.10.6", + "@lerna/describe-ref": "3.10.0", + "@lerna/log-packed": "3.6.0", + "@lerna/npm-conf": "3.7.0", + "@lerna/npm-dist-tag": "3.8.5", + "@lerna/npm-publish": "3.10.7", + "@lerna/output": "3.6.0", + "@lerna/pack-directory": "3.10.5", + "@lerna/prompt": "3.6.0", + "@lerna/pulse-till-done": "3.7.1", + "@lerna/run-lifecycle": "3.10.5", + 
"@lerna/run-parallel-batches": "3.0.0", + "@lerna/validation-error": "3.6.0", + "@lerna/version": "3.10.6", + "figgy-pudding": "^3.5.1", "fs-extra": "^7.0.0", - "libnpmaccess": "^3.0.0", - "npm-package-arg": "^6.0.0", - "npm-registry-fetch": "^3.8.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "p-finally": "^1.0.0", "p-map": "^1.2.0", "p-pipe": "^1.2.0", @@ -968,55 +1105,72 @@ "semver": "^5.5.0" } }, + "@lerna/pulse-till-done": { + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@lerna/pulse-till-done/-/pulse-till-done-3.7.1.tgz", + "integrity": "sha512-MzpesZeW3Mc+CiAq4zUt9qTXI9uEBBKrubYHE36voQTSkHvu/Rox6YOvfUr+U7P6k8frFPeCgGpfMDTLhiqe6w==", + "dev": true, + "requires": { + "libnpm": "^2.0.1" + } + }, "@lerna/resolve-symlink": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/resolve-symlink/-/resolve-symlink-3.3.0.tgz", - "integrity": "sha512-KmoPDcFJ2aOK2inYHbrsiO9SodedUj0L1JDvDgirVNIjMUaQe2Q6Vi4Gh+VCJcyB27JtfHioV9R2NxU72Pk2hg==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/resolve-symlink/-/resolve-symlink-3.6.0.tgz", + "integrity": "sha512-TVOAEqHJSQVhNDMFCwEUZPaOETqHDQV1TQWQfC8ZlOqyaUQ7veZUbg0yfG7RPNzlSpvF0ZaGFeR0YhYDAW03GA==", "dev": true, "requires": { "fs-extra": "^7.0.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "read-cmd-shim": "^1.0.1" } }, "@lerna/rimraf-dir": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/rimraf-dir/-/rimraf-dir-3.3.0.tgz", - "integrity": "sha512-vSqOcZ4kZduiSprbt+y40qziyN3VKYh+ygiCdnbBbsaxpdKB6CfrSMUtrLhVFrqUfBHIZRzHIzgjTdtQex1KLw==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/rimraf-dir/-/rimraf-dir-3.10.0.tgz", + "integrity": "sha512-RSKSfxPURc58ERCD/PuzorR86lWEvIWNclXYGvIYM76yNGrWiDF44pGHQvB4J+Lxa5M+52ZtZC/eOC7A7YCH4g==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "npmlog": "^4.1.2", + "@lerna/child-process": "3.3.0", + "libnpm": "^2.0.1", "path-exists": "^3.0.0", "rimraf": "^2.6.2" + }, + "dependencies": { + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } } }, "@lerna/run": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/run/-/run-3.5.0.tgz", - "integrity": "sha512-BnPD52tj794xG2Xsc4FvgksyFX2CLmSR28TZw/xASEuy14NuQYMZkvbaj61SEhyOEsq7pLhHE5PpfbIv2AIFJw==", - "dev": true, - "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/npm-run-script": "^3.3.0", - "@lerna/output": "^3.0.0", - "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/timer": "^3.5.0", - "@lerna/validation-error": "^3.0.0", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/run/-/run-3.10.6.tgz", + "integrity": "sha512-KS2lWbu/8WUUscQPi9U8sPO6yYpzf/0GmODjpruR1nRi1u/tuncdjTiG+hjGAeFC1BD7YktT9Za6imIpE8RXmA==", + "dev": true, + "requires": { + "@lerna/batch-packages": "3.10.6", + "@lerna/command": "3.10.6", + "@lerna/filter-options": "3.10.6", + "@lerna/npm-run-script": "3.10.0", + "@lerna/output": "3.6.0", + "@lerna/run-parallel-batches": "3.0.0", + "@lerna/timer": "3.5.0", + "@lerna/validation-error": "3.6.0", "p-map": "^1.2.0" } }, "@lerna/run-lifecycle": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/run-lifecycle/-/run-lifecycle-3.4.1.tgz", - "integrity": "sha512-N/hi2srM9A4BWEkXccP7vCEbf4MmIuALF00DTBMvc0A/ccItwUpl3XNuM7+ADDRK0mkwE3hDw89lJ3A7f8oUQw==", + "version": 
"3.10.5", + "resolved": "https://registry.npmjs.org/@lerna/run-lifecycle/-/run-lifecycle-3.10.5.tgz", + "integrity": "sha512-YPmXviaxVlhcKM6IkDTIpTq24mxOuMCilo+MTr1RLoafgB9ZTmP2AHRiFt/sy14wOsq2Zqr0wJyj8KFlDYLTkA==", "dev": true, "requires": { - "@lerna/npm-conf": "^3.4.1", - "npm-lifecycle": "^2.0.0", - "npmlog": "^4.1.2" + "@lerna/npm-conf": "3.7.0", + "figgy-pudding": "^3.5.1", + "libnpm": "^2.0.1" } }, "@lerna/run-parallel-batches": { @@ -1030,79 +1184,28 @@ } }, "@lerna/symlink-binary": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/symlink-binary/-/symlink-binary-3.3.0.tgz", - "integrity": "sha512-zRo6CimhvH/VJqCFl9T4IC6syjpWyQIxEfO2sBhrapEcfwjtwbhoGgKwucsvt4rIpFazCw63jQ/AXMT27KUIHg==", + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/symlink-binary/-/symlink-binary-3.10.0.tgz", + "integrity": "sha512-6mQsG+iVjBo8cD8s24O+YgFrwDyUGfUQbK4ryalAXFHI817Zd4xlI3tjg3W99whCt6rt6D0s1fpf8eslMN6dSw==", + "dev": true, + "requires": { + "@lerna/create-symlink": "3.6.0", + "@lerna/package": "3.7.2", + "fs-extra": "^7.0.0", + "p-map": "^1.2.0" + } + }, + "@lerna/symlink-dependencies": { + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/@lerna/symlink-dependencies/-/symlink-dependencies-3.10.0.tgz", + "integrity": "sha512-vGpg5ydwGgQCuWNX5y7CRL38mGpuLhf1GRq9wMm7IGwnctEsdSNqvvE+LDgqtwEZASu5+vffYUkL0VlFXl8uWA==", "dev": true, "requires": { - "@lerna/create-symlink": "^3.3.0", - "@lerna/package": "^3.0.0", + "@lerna/create-symlink": "3.6.0", + "@lerna/resolve-symlink": "3.6.0", + "@lerna/symlink-binary": "3.10.0", "fs-extra": "^7.0.0", - "p-map": "^1.2.0", - "read-pkg": "^3.0.0" - }, - "dependencies": { - "load-json-file": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", - "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "parse-json": "^4.0.0", - "pify": "^3.0.0", - "strip-bom": "^3.0.0" - } - }, - "parse-json": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", - "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", - "dev": true, - "requires": { - "error-ex": "^1.3.1", - "json-parse-better-errors": "^1.0.1" - } - }, - "path-type": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", - "integrity": "sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", - "dev": true, - "requires": { - "pify": "^3.0.0" - } - }, - "read-pkg": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", - "integrity": "sha1-nLxoaXj+5l0WwA4rGcI3/Pbjg4k=", - "dev": true, - "requires": { - "load-json-file": "^4.0.0", - "normalize-package-data": "^2.3.2", - "path-type": "^3.0.0" - } - }, - "strip-bom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", - "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", - "dev": true - } - } - }, - "@lerna/symlink-dependencies": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/symlink-dependencies/-/symlink-dependencies-3.3.0.tgz", - "integrity": "sha512-IRngSNCmuD5uBKVv23tHMvr7Mplti0lKHilFKcvhbvhAfu6m/Vclxhkfs/uLyHzG+DeRpl/9o86SQET3h4XDhg==", - "dev": true, - "requires": { - "@lerna/create-symlink": "^3.3.0", - "@lerna/resolve-symlink": "^3.3.0", - "@lerna/symlink-binary": "^3.3.0", - "fs-extra": "^7.0.0", - "p-finally": "^1.0.0", + "p-finally": "^1.0.0", "p-map": 
"^1.2.0", "p-map-series": "^1.0.0" } @@ -1114,34 +1217,34 @@ "dev": true }, "@lerna/validation-error": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/validation-error/-/validation-error-3.0.0.tgz", - "integrity": "sha512-5wjkd2PszV0kWvH+EOKZJWlHEqCTTKrWsvfHnHhcUaKBe/NagPZFWs+0xlsDPZ3DJt5FNfbAPAnEBQ05zLirFA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/validation-error/-/validation-error-3.6.0.tgz", + "integrity": "sha512-MWltncGO5VgMS0QedTlZCjFUMF/evRjDMMHrtVorkIB2Cp5xy0rkKa8iDBG43qpUWeG1giwi58yUlETBcWfILw==", "dev": true, "requires": { - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/version": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/version/-/version-3.5.0.tgz", - "integrity": "sha512-vxuGkUSfjJuvOIgPG7SDXVmk4GPwJF9F+uhDW9T/wJzTk4UaxL37GpBeJDo43eutQ7mwluP+t88Luwf8S3WXlA==", - "dev": true, - "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/check-working-tree": "^3.5.0", - "@lerna/child-process": "^3.3.0", - "@lerna/collect-updates": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/conventional-commits": "^3.5.0", - "@lerna/output": "^3.0.0", - "@lerna/prompt": "^3.3.1", - "@lerna/run-lifecycle": "^3.4.1", - "@lerna/validation-error": "^3.0.0", + "version": "3.10.6", + "resolved": "https://registry.npmjs.org/@lerna/version/-/version-3.10.6.tgz", + "integrity": "sha512-77peW2ROlHHl1e/tHBUmhpb8tsO6CIdlx34XapZhUuIVykrkOuqVFFxqMecrGG8SJe0e3l1G+Fah7bJTQcG0kw==", + "dev": true, + "requires": { + "@lerna/batch-packages": "3.10.6", + "@lerna/check-working-tree": "3.10.0", + "@lerna/child-process": "3.3.0", + "@lerna/collect-updates": "3.10.1", + "@lerna/command": "3.10.6", + "@lerna/conventional-commits": "3.10.0", + "@lerna/output": "3.6.0", + "@lerna/prompt": "3.6.0", + "@lerna/run-lifecycle": "3.10.5", + "@lerna/validation-error": "3.6.0", "chalk": "^2.3.1", "dedent": "^0.7.0", + "libnpm": "^2.0.1", "minimatch": "^3.0.4", - "npmlog": "^4.1.2", "p-map": "^1.2.0", "p-pipe": "^1.2.0", "p-reduce": "^1.0.0", @@ -1152,15 +1255,24 @@ } }, "@lerna/write-log-file": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/write-log-file/-/write-log-file-3.0.0.tgz", - "integrity": "sha512-SfbPp29lMeEVOb/M16lJwn4nnx5y+TwCdd7Uom9umd7KcZP0NOvpnX0PHehdonl7TyHZ1Xx2maklYuCLbQrd/A==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/write-log-file/-/write-log-file-3.6.0.tgz", + "integrity": "sha512-OkLK99V6sYXsJsYg+O9wtiFS3z6eUPaiz2e6cXJt80mfIIdI1t2dnmyua0Ib5cZWExQvx2z6Y32Wlf0MnsoNsA==", "dev": true, "requires": { - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "write-file-atomic": "^2.3.0" } }, + "@mattiasbuelens/web-streams-polyfill": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/@mattiasbuelens/web-streams-polyfill/-/web-streams-polyfill-0.2.1.tgz", + "integrity": "sha512-oKuFCQFa3W7Hj7zKn0+4ypI8JFm4ZKIoncwAC6wd5WwFW2sL7O1hpPoJdSWpynQ4DJ4lQ6MvFoVDmCLilonDFg==", + "dev": true, + "requires": { + "@types/whatwg-streams": "^0.0.7" + } + }, "@mrmlnc/readdir-enhanced": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/@mrmlnc/readdir-enhanced/-/readdir-enhanced-2.2.1.tgz", @@ -1177,15 +1289,6 @@ "integrity": "sha512-shAmDyaQC4H92APFoIaVDHCx5bStIocgvbwQyxPRrbUY20V1EYTbSDchWbuwlMG3V17cprZhA6+78JfB+3DTPw==", "dev": true }, - "@samverschueren/stream-to-observable": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/@samverschueren/stream-to-observable/-/stream-to-observable-0.3.0.tgz", - "integrity": 
"sha512-MI4Xx6LHs4Webyvi6EbspgyAb4D2Q2VtnCQ1blOJcoLS6mVa8lNN2rkIy1CVxfTUpoyIbCTkXES1rLXztFD1lg==", - "dev": true, - "requires": { - "any-observable": "^0.3.0" - } - }, "@sindresorhus/df": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/@sindresorhus/df/-/df-2.1.0.tgz", @@ -1225,22 +1328,16 @@ } } }, - "@std/esm": { - "version": "0.26.0", - "resolved": "https://registry.npmjs.org/@std/esm/-/esm-0.26.0.tgz", - "integrity": "sha512-g3RDuosSa5fZOzENtrZdx7Gevb3zabfn8qglug2aCJIVz/4woFpKoqm1yD3mG2RD0zJEZRnkkuPHsmNglKGl7g==", - "dev": true - }, "@types/events": { - "version": "1.2.0", - "resolved": "http://registry.npmjs.org/@types/events/-/events-1.2.0.tgz", - "integrity": "sha512-KEIlhXnIutzKwRbQkGWb/I4HFqBuUykAdHgDED6xqwXJfONCjF5VoE0cXEiurh3XauygxzeDzgtXUqvLkxFzzA==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/@types/events/-/events-3.0.0.tgz", + "integrity": "sha512-EaObqwIvayI5a8dCzhFrjKzVwKLxjoG9T6Ppd5CEo07LRKfQ8Yokw54r5+Wq7FaBQ+yXRvQAYPrHwya1/UFt9g==", "dev": true }, "@types/flatbuffers": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/@types/flatbuffers/-/flatbuffers-1.9.0.tgz", - "integrity": "sha512-Ek+pJNTxBXBalTaTqKNwsaF3G8kfsmtYHxdWb8EUAS7dcPkSbRcNFGlaBQboXVSfSU/Vu32J3qs5Tgj56szDTw==" + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/@types/flatbuffers/-/flatbuffers-1.9.1.tgz", + "integrity": "sha512-TC3X0Nkj5wgvuY217VkodBtjbD3Yr0JNApDY1GW9IU5Mzm5ie1IJErqe4vRm+wy08IRz3bemaDATrdEw1CJlVQ==" }, "@types/fs-extra": { "version": "5.0.4", @@ -1263,9 +1360,9 @@ } }, "@types/handlebars": { - "version": "4.0.39", - "resolved": "https://registry.npmjs.org/@types/handlebars/-/handlebars-4.0.39.tgz", - "integrity": "sha512-vjaS7Q0dVqFp85QhyPSZqDKnTTCemcSHNHFvDdalO1s0Ifz5KuE64jQD5xoUkfdWwF4WpqdJEl7LsWH8rzhKJA==", + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@types/handlebars/-/handlebars-4.0.40.tgz", + "integrity": "sha512-sGWNtsjNrLOdKha2RV1UeF8+UbQnPSG7qbe5wwbni0mw4h2gHXyPFUMOC+xwGirIiiydM/HSqjDO4rk6NFB18w==", "dev": true }, "@types/highlight.js": { @@ -1275,15 +1372,15 @@ "dev": true }, "@types/jest": { - "version": "23.3.5", - "resolved": "https://registry.npmjs.org/@types/jest/-/jest-23.3.5.tgz", - "integrity": "sha512-3LI+vUC3Wju28vbjIjsTKakhMB8HC4l+tMz+Z8WRzVK+kmvezE5jcOvKtBpznWSI5KDLFo+FouUhpTKoekadCA==", + "version": "23.3.13", + "resolved": "https://registry.npmjs.org/@types/jest/-/jest-23.3.13.tgz", + "integrity": "sha512-ePl4l+7dLLmCucIwgQHAgjiepY++qcI6nb8eAwGNkB6OxmTe3Z9rQU3rSpomqu42PCCnlThZbOoxsf+qylJsLA==", "dev": true }, "@types/lodash": { - "version": "4.14.118", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.118.tgz", - "integrity": "sha512-iiJbKLZbhSa6FYRip/9ZDX6HXhayXLDGY2Fqws9cOkEQ6XeKfaxB0sC541mowZJueYyMnVUmmG+al5/4fCDrgw==", + "version": "4.14.120", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.120.tgz", + "integrity": "sha512-jQ21kQ120mo+IrDs1nFNVm/AsdFxIx2+vZ347DbogHJPd/JzKNMOqU6HCYin1W6v8l5R9XSO2/e9cxmn7HAnVw==", "dev": true }, "@types/marked": { @@ -1299,14 +1396,14 @@ "dev": true }, "@types/node": { - "version": "10.12.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.0.tgz", - "integrity": "sha512-3TUHC3jsBAB7qVRGxT6lWyYo2v96BMmD2PTcl47H25Lu7UXtFH/2qqmKiVrnel6Ne//0TFYf6uvNX+HW2FRkLQ==" + "version": "10.12.18", + "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz", + "integrity": "sha512-fh+pAqt4xRzPfqA6eh3Z2y6fyZavRIumvjhaCL753+TVkGKGhpPeyrJG2JftD0T9q4GF00KjefsQ+PQNDdWQaQ==" }, "@types/shelljs": { 
- "version": "0.8.0", - "resolved": "https://registry.npmjs.org/@types/shelljs/-/shelljs-0.8.0.tgz", - "integrity": "sha512-vs1hCC8RxLHRu2bwumNyYRNrU3o8BtZhLysH5A4I98iYmA2APl6R3uNQb5ihl+WiwH0xdC9LLO+vRrXLs/Kyxg==", + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/@types/shelljs/-/shelljs-0.8.2.tgz", + "integrity": "sha512-vVp7BCQn0yUQgpiohrdxAhHdm/bTlXshB4HG3LEBq1PgvjKiyeYHohIPIv0QBt/jipb140iMS5Xy1iR6qKovKw==", "dev": true, "requires": { "@types/glob": "*", @@ -1318,175 +1415,181 @@ "resolved": "https://registry.npmjs.org/@types/text-encoding-utf-8/-/text-encoding-utf-8-1.0.1.tgz", "integrity": "sha512-GpIEYaS+yNfYqpowLLziiY42pyaL+lThd/wMh6tTubaKuG4IRkXqqyxK7Nddn3BvpUg2+go3Gv/jbXvAFMRjiQ==" }, + "@types/whatwg-streams": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/@types/whatwg-streams/-/whatwg-streams-0.0.7.tgz", + "integrity": "sha512-6sDiSEP6DWcY2ZolsJ2s39ZmsoGQ7KVwBDI3sESQsEm9P2dHTcqnDIHRZFRNtLCzWp7hCFGqYbw5GyfpQnJ01A==", + "dev": true + }, "@webassemblyjs/ast": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.7.10.tgz", - "integrity": "sha512-wTUeaByYN2EA6qVqhbgavtGc7fLTOx0glG2IBsFlrFG51uXIGlYBTyIZMf4SPLo3v1bgV/7lBN3l7Z0R6Hswew==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.7.11.tgz", + "integrity": "sha512-ZEzy4vjvTzScC+SH8RBssQUawpaInUdMTYwYYLh54/s8TuT0gBLuyUnppKsVyZEi876VmmStKsUs28UxPgdvrA==", "dev": true, "requires": { - "@webassemblyjs/helper-module-context": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/wast-parser": "1.7.10" + "@webassemblyjs/helper-module-context": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/wast-parser": "1.7.11" } }, "@webassemblyjs/floating-point-hex-parser": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.7.10.tgz", - "integrity": "sha512-gMsGbI6I3p/P1xL2UxqhNh1ga2HCsx5VBB2i5VvJFAaqAjd2PBTRULc3BpTydabUQEGlaZCzEUQhLoLG7TvEYQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.7.11.tgz", + "integrity": "sha512-zY8dSNyYcgzNRNT666/zOoAyImshm3ycKdoLsyDw/Bwo6+/uktb7p4xyApuef1dwEBo/U/SYQzbGBvV+nru2Xg==", "dev": true }, "@webassemblyjs/helper-api-error": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.7.10.tgz", - "integrity": "sha512-DoYRlPWtuw3yd5BOr9XhtrmB6X1enYF0/54yNvQWGXZEPDF5PJVNI7zQ7gkcKfTESzp8bIBWailaFXEK/jjCsw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.7.11.tgz", + "integrity": "sha512-7r1qXLmiglC+wPNkGuXCvkmalyEstKVwcueZRP2GNC2PAvxbLYwLLPr14rcdJaE4UtHxQKfFkuDFuv91ipqvXg==", "dev": true }, "@webassemblyjs/helper-buffer": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.7.10.tgz", - "integrity": "sha512-+RMU3dt/dPh4EpVX4u5jxsOlw22tp3zjqE0m3ftU2tsYxnPULb4cyHlgaNd2KoWuwasCQqn8Mhr+TTdbtj3LlA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.7.11.tgz", + "integrity": "sha512-MynuervdylPPh3ix+mKZloTcL06P8tenNH3sx6s0qE8SLR6DdwnfgA7Hc9NSYeob2jrW5Vql6GVlsQzKQCa13w==", "dev": true }, "@webassemblyjs/helper-code-frame": { - "version": "1.7.10", - "resolved": 
"https://registry.npmjs.org/@webassemblyjs/helper-code-frame/-/helper-code-frame-1.7.10.tgz", - "integrity": "sha512-UiytbpKAULOEab2hUZK2ywXen4gWJVrgxtwY3Kn+eZaaSWaRM8z/7dAXRSoamhKFiBh1uaqxzE/XD9BLlug3gw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-code-frame/-/helper-code-frame-1.7.11.tgz", + "integrity": "sha512-T8ESC9KMXFTXA5urJcyor5cn6qWeZ4/zLPyWeEXZ03hj/x9weSokGNkVCdnhSabKGYWxElSdgJ+sFa9G/RdHNw==", "dev": true, "requires": { - "@webassemblyjs/wast-printer": "1.7.10" + "@webassemblyjs/wast-printer": "1.7.11" } }, "@webassemblyjs/helper-fsm": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-fsm/-/helper-fsm-1.7.10.tgz", - "integrity": "sha512-w2vDtUK9xeSRtt5+RnnlRCI7wHEvLjF0XdnxJpgx+LJOvklTZPqWkuy/NhwHSLP19sm9H8dWxKeReMR7sCkGZA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-fsm/-/helper-fsm-1.7.11.tgz", + "integrity": "sha512-nsAQWNP1+8Z6tkzdYlXT0kxfa2Z1tRTARd8wYnc/e3Zv3VydVVnaeePgqUzFrpkGUyhUUxOl5ML7f1NuT+gC0A==", "dev": true }, "@webassemblyjs/helper-module-context": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-module-context/-/helper-module-context-1.7.10.tgz", - "integrity": "sha512-yE5x/LzZ3XdPdREmJijxzfrf+BDRewvO0zl8kvORgSWmxpRrkqY39KZSq6TSgIWBxkK4SrzlS3BsMCv2s1FpsQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-module-context/-/helper-module-context-1.7.11.tgz", + "integrity": "sha512-JxfD5DX8Ygq4PvXDucq0M+sbUFA7BJAv/GGl9ITovqE+idGX+J3QSzJYz+LwQmL7fC3Rs+utvWoJxDb6pmC0qg==", "dev": true }, "@webassemblyjs/helper-wasm-bytecode": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.7.10.tgz", - "integrity": "sha512-u5qy4SJ/OrxKxZqJ9N3qH4ZQgHaAzsopsYwLvoWJY6Q33r8PhT3VPyNMaJ7ZFoqzBnZlCcS/0f4Sp8WBxylXfg==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.7.11.tgz", + "integrity": "sha512-cMXeVS9rhoXsI9LLL4tJxBgVD/KMOKXuFqYb5oCJ/opScWpkCMEz9EJtkonaNcnLv2R3K5jIeS4TRj/drde1JQ==", "dev": true }, "@webassemblyjs/helper-wasm-section": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.7.10.tgz", - "integrity": "sha512-Ecvww6sCkcjatcyctUrn22neSJHLN/TTzolMGG/N7S9rpbsTZ8c6Bl98GpSpV77EvzNijiNRHBG0+JO99qKz6g==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.7.11.tgz", + "integrity": "sha512-8ZRY5iZbZdtNFE5UFunB8mmBEAbSI3guwbrsCl4fWdfRiAcvqQpeqd5KHhSWLL5wuxo53zcaGZDBU64qgn4I4Q==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-buffer": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/wasm-gen": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-buffer": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/wasm-gen": "1.7.11" } }, "@webassemblyjs/ieee754": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.7.10.tgz", - "integrity": "sha512-HRcWcY+YWt4+s/CvQn+vnSPfRaD4KkuzQFt5MNaELXXHSjelHlSEA8ZcqT69q0GTIuLWZ6JaoKar4yWHVpZHsQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.7.11.tgz", + "integrity": 
"sha512-Mmqx/cS68K1tSrvRLtaV/Lp3NZWzXtOHUW2IvDvl2sihAwJh4ACE0eL6A8FvMyDG9abes3saB6dMimLOs+HMoQ==", "dev": true, "requires": { "@xtuc/ieee754": "^1.2.0" } }, "@webassemblyjs/leb128": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.7.10.tgz", - "integrity": "sha512-og8MciYlA8hvzCLR71hCuZKPbVBfLQeHv7ImKZ4nlyxrYbG7uJHYtHiHu6OV9SqrGuD03H/HtXC4Bgdjfm9FHw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.7.11.tgz", + "integrity": "sha512-vuGmgZjjp3zjcerQg+JA+tGOncOnJLWVkt8Aze5eWQLwTQGNgVLcyOTqgSCxWTR4J42ijHbBxnuRaL1Rv7XMdw==", "dev": true, "requires": { "@xtuc/long": "4.2.1" } }, "@webassemblyjs/utf8": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.7.10.tgz", - "integrity": "sha512-Ng6Pxv6siyZp635xCSnH3mKmIFgqWPCcGdoo0GBYgyGdxu7cUj4agV7Uu1a8REP66UYUFXJLudeGgd4RvuJAnQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.7.11.tgz", + "integrity": "sha512-C6GFkc7aErQIAH+BMrIdVSmW+6HSe20wg57HEC1uqJP8E/xpMjXqQUxkQw07MhNDSDcGpxI9G5JSNOQCqJk4sA==", "dev": true }, "@webassemblyjs/wasm-edit": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.7.10.tgz", - "integrity": "sha512-e9RZFQlb+ZuYcKRcW9yl+mqX/Ycj9+3/+ppDI8nEE/NCY6FoK8f3dKBcfubYV/HZn44b+ND4hjh+4BYBt+sDnA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.7.11.tgz", + "integrity": "sha512-FUd97guNGsCZQgeTPKdgxJhBXkUbMTY6hFPf2Y4OedXd48H97J+sOY2Ltaq6WGVpIH8o/TGOVNiVz/SbpEMJGg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-buffer": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/helper-wasm-section": "1.7.10", - "@webassemblyjs/wasm-gen": "1.7.10", - "@webassemblyjs/wasm-opt": "1.7.10", - "@webassemblyjs/wasm-parser": "1.7.10", - "@webassemblyjs/wast-printer": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-buffer": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/helper-wasm-section": "1.7.11", + "@webassemblyjs/wasm-gen": "1.7.11", + "@webassemblyjs/wasm-opt": "1.7.11", + "@webassemblyjs/wasm-parser": "1.7.11", + "@webassemblyjs/wast-printer": "1.7.11" } }, "@webassemblyjs/wasm-gen": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.7.10.tgz", - "integrity": "sha512-M0lb6cO2Y0PzDye/L39PqwV+jvO+2YxEG5ax+7dgq7EwXdAlpOMx1jxyXJTScQoeTpzOPIb+fLgX/IkLF8h2yw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.7.11.tgz", + "integrity": "sha512-U/KDYp7fgAZX5KPfq4NOupK/BmhDc5Kjy2GIqstMhvvdJRcER/kUsMThpWeRP8BMn4LXaKhSTggIJPOeYHwISA==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/ieee754": "1.7.10", - "@webassemblyjs/leb128": "1.7.10", - "@webassemblyjs/utf8": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/ieee754": "1.7.11", + "@webassemblyjs/leb128": "1.7.11", + "@webassemblyjs/utf8": "1.7.11" } }, "@webassemblyjs/wasm-opt": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.7.10.tgz", - "integrity": "sha512-R66IHGCdicgF5ZliN10yn5HaC7vwYAqrSVJGjtJJQp5+QNPBye6heWdVH/at40uh0uoaDN/UVUfXK0gvuUqtVg==", + "version": "1.7.11", + 
"resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.7.11.tgz", + "integrity": "sha512-XynkOwQyiRidh0GLua7SkeHvAPXQV/RxsUeERILmAInZegApOUAIJfRuPYe2F7RcjOC9tW3Cb9juPvAC/sCqvg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-buffer": "1.7.10", - "@webassemblyjs/wasm-gen": "1.7.10", - "@webassemblyjs/wasm-parser": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-buffer": "1.7.11", + "@webassemblyjs/wasm-gen": "1.7.11", + "@webassemblyjs/wasm-parser": "1.7.11" } }, "@webassemblyjs/wasm-parser": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.7.10.tgz", - "integrity": "sha512-AEv8mkXVK63n/iDR3T693EzoGPnNAwKwT3iHmKJNBrrALAhhEjuPzo/lTE4U7LquEwyvg5nneSNdTdgrBaGJcA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.7.11.tgz", + "integrity": "sha512-6lmXRTrrZjYD8Ng8xRyvyXQJYUQKYSXhJqXOBLw24rdiXsHAOlvw5PhesjdcaMadU/pyPQOJ5dHreMjBxwnQKg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-api-error": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/ieee754": "1.7.10", - "@webassemblyjs/leb128": "1.7.10", - "@webassemblyjs/utf8": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-api-error": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/ieee754": "1.7.11", + "@webassemblyjs/leb128": "1.7.11", + "@webassemblyjs/utf8": "1.7.11" } }, "@webassemblyjs/wast-parser": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-parser/-/wast-parser-1.7.10.tgz", - "integrity": "sha512-YTPEtOBljkCL0VjDp4sHe22dAYSm3ZwdJ9+2NTGdtC7ayNvuip1wAhaAS8Zt9Q6SW9E5Jf5PX7YE3XWlrzR9cw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-parser/-/wast-parser-1.7.11.tgz", + "integrity": "sha512-lEyVCg2np15tS+dm7+JJTNhNWq9yTZvi3qEhAIIOaofcYlUp0UR5/tVqOwa/gXYr3gjwSZqw+/lS9dscyLelbQ==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/floating-point-hex-parser": "1.7.10", - "@webassemblyjs/helper-api-error": "1.7.10", - "@webassemblyjs/helper-code-frame": "1.7.10", - "@webassemblyjs/helper-fsm": "1.7.10", + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/floating-point-hex-parser": "1.7.11", + "@webassemblyjs/helper-api-error": "1.7.11", + "@webassemblyjs/helper-code-frame": "1.7.11", + "@webassemblyjs/helper-fsm": "1.7.11", "@xtuc/long": "4.2.1" } }, "@webassemblyjs/wast-printer": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.7.10.tgz", - "integrity": "sha512-mJ3QKWtCchL1vhU/kZlJnLPuQZnlDOdZsyP0bbLWPGdYsQDnSBvyTLhzwBA3QAMlzEL9V4JHygEmK6/OTEyytA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.7.11.tgz", + "integrity": "sha512-m5vkAsuJ32QpkdkDOUPGSltrg8Cuk3KBx4YrmAGQwCZPRdUHXxG4phIOuuycLemHFr74sWL9Wthqss4fzdzSwg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/wast-parser": "1.7.10", + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/wast-parser": "1.7.11", "@xtuc/long": "4.2.1" } }, @@ -1531,13 +1634,10 @@ "dev": true }, "acorn-dynamic-import": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/acorn-dynamic-import/-/acorn-dynamic-import-3.0.0.tgz", - "integrity": 
"sha512-zVWV8Z8lislJoOKKqdNMOB+s6+XV5WERty8MnKBeFgwA+19XJjJHs2RP5dzM57FftIs+jQnRToLiWazKr6sSWg==", - "dev": true, - "requires": { - "acorn": "^5.0.0" - } + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/acorn-dynamic-import/-/acorn-dynamic-import-4.0.0.tgz", + "integrity": "sha512-d3OEjQV4ROpoflsnUA8HozoIR504TFxNivYEUi6uwz0IYhBkTDXGuWlNdMtybRt3nqVx/L6XqMt0FxkXuWKZhw==", + "dev": true }, "acorn-globals": { "version": "4.3.0", @@ -1550,17 +1650,17 @@ }, "dependencies": { "acorn": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.0.2.tgz", - "integrity": "sha512-GXmKIvbrN3TV7aVqAzVFaMW8F8wzVX7voEBRO3bDA64+EX37YSayggRJP5Xig6HYHBkWKpFg9W5gg6orklubhg==", + "version": "6.0.5", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.0.5.tgz", + "integrity": "sha512-i33Zgp3XWtmZBMNvCr4azvOFeWVw1Rk6p3hfi3LUDvIFraOMywb1kAtrbi+med14m4Xfpqm3zRZMT+c0FNE7kg==", "dev": true } } }, "acorn-walk": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-6.1.0.tgz", - "integrity": "sha512-ugTb7Lq7u4GfWSqqpwE0bGyoBZNMTok/zDBXxfEG0QM50jNlGhIWjRC1pPN7bvV1anhF+bs+/gNcRw+o55Evbg==", + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-6.1.1.tgz", + "integrity": "sha512-OtUw6JUTgxA2QoqqmrmQ7F2NYqiBPi/L2jqHyFtllhOUvXYQXf0Z1CYUinIfyT4bTCGmrA7gX9FvHA81uzCoVw==", "dev": true }, "agent-base": { @@ -1582,34 +1682,37 @@ } }, "ajv": { - "version": "5.5.2", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-5.5.2.tgz", - "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=", + "version": "6.7.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.7.0.tgz", + "integrity": "sha512-RZXPviBTtfmtka9n9sy1N5M5b82CbxWIR6HIis4s3WQTXDJamc/0gpCWNGz6EWdWp4DOfjzJfhz/AS9zVPjjWg==", "dev": true, "requires": { - "co": "^4.6.0", - "fast-deep-equal": "^1.0.0", + "fast-deep-equal": "^2.0.1", "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.3.0" + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" } }, "ajv-errors": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/ajv-errors/-/ajv-errors-1.0.0.tgz", - "integrity": "sha1-7PAh+hCP0X37Xms4Py3SM+Mf/Fk=", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/ajv-errors/-/ajv-errors-1.0.1.tgz", + "integrity": "sha512-DCRfO/4nQ+89p/RK43i8Ezd41EqdGIU4ld7nGF8OQ14oc/we5rEntLCUa7+jrn3nn83BosfwZA0wb4pon2o8iQ==", "dev": true }, "ajv-keywords": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.2.0.tgz", - "integrity": "sha1-6GuBnGAs+IIa1jdBNpjx3sAhhHo=", + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.3.0.tgz", + "integrity": "sha512-CMzN9S62ZOO4sA/mJZIO4S++ZM7KFWzH3PPWkveLhy4OZ9i1/VatgwWMD46w/XbGCBy7Ye0gCk+Za6mmyfKK7g==", "dev": true }, "ansi-colors": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-2.0.5.tgz", - "integrity": "sha512-yAdfUZ+c2wetVNIFsNRn44THW+Lty6S5TwMpUfLA/UaGhiXbBv/F8E60/1hMLd0cnF/CDoWH8vzVaI5bAcHCjw==", - "dev": true + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-1.1.0.tgz", + "integrity": "sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==", + "dev": true, + "requires": { + "ansi-wrap": "^0.1.0" + } }, "ansi-escapes": { "version": "3.1.0", @@ -1646,20 +1749,14 @@ "integrity": "sha1-qCJQ3bABXponyoLoLqYDu/pF768=", "dev": true }, - "any-observable": { - "version": "0.3.0", - "resolved": 
"https://registry.npmjs.org/any-observable/-/any-observable-0.3.0.tgz", - "integrity": "sha512-/FQM1EDkTsf63Ub2C6O7GuYFDsSXUwsaZDurV0np41ocwq0jthUAYCmhBX9f+KwlaCgIuWyr/4WlUQUBfKfZog==", - "dev": true - }, "anymatch": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-1.3.2.tgz", - "integrity": "sha512-0XNayC8lTHQ2OI8aljNCN3sSx6hsr/1+rlcDAotXJR7C1oZZHCNsfpbKwMjRA3Uqb5tF1Rae2oloTr4xpq+WjA==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", + "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", "dev": true, "requires": { - "micromatch": "^2.1.5", - "normalize-path": "^2.0.0" + "micromatch": "^3.1.4", + "normalize-path": "^2.1.1" } }, "append-buffer": { @@ -1702,6 +1799,12 @@ "readable-stream": "^2.0.6" } }, + "arg": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.0.tgz", + "integrity": "sha512-ZWc51jO3qegGkVh8Hwpv636EkbesNV5ZNQPCtRa+0qytRYPEs9IYT9qITY9buezqUH5uqyzlWLcufrzU2rffdg==", + "dev": true + }, "argparse": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", @@ -1721,13 +1824,10 @@ } }, "arr-diff": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", - "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", - "dev": true, - "requires": { - "arr-flatten": "^1.0.1" - } + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", + "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", + "dev": true }, "arr-filter": { "version": "1.1.2", @@ -1891,9 +1991,9 @@ "dev": true }, "array-unique": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", - "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", + "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", "dev": true }, "arrify": { @@ -1945,7 +2045,7 @@ }, "util": { "version": "0.10.3", - "resolved": "http://registry.npmjs.org/util/-/util-0.10.3.tgz", + "resolved": "https://registry.npmjs.org/util/-/util-0.10.3.tgz", "integrity": "sha1-evsa/lCAUkZInj23/g7TeTNqwPk=", "dev": true, "requires": { @@ -1991,14 +2091,6 @@ "once": "^1.3.2", "process-nextick-args": "^1.0.7", "stream-exhaust": "^1.0.1" - }, - "dependencies": { - "process-nextick-args": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", - "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", - "dev": true - } } }, "async-each": { @@ -2065,7 +2157,7 @@ }, "chalk": { "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", "dev": true, "requires": { @@ -2158,7 +2250,7 @@ }, "babel-plugin-istanbul": { "version": "4.1.6", - "resolved": "http://registry.npmjs.org/babel-plugin-istanbul/-/babel-plugin-istanbul-4.1.6.tgz", + "resolved": "https://registry.npmjs.org/babel-plugin-istanbul/-/babel-plugin-istanbul-4.1.6.tgz", "integrity": "sha512-PWP9FQ1AhZhS01T/4qLSKoHGY/xvkZdVBGlKM/HuxxS3+sC66HhTNR7+MpbO/so/cz/wY94MeSWJuP1hXIPfwQ==", "dev": true, "requires": { @@ -2166,6 +2258,17 @@ "find-up": "^2.1.0", "istanbul-lib-instrument": "^1.10.1", "test-exclude": "^4.2.1" + }, + "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": 
"https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + } } }, "babel-plugin-jest-hoist": { @@ -2176,32 +2279,10 @@ }, "babel-plugin-syntax-object-rest-spread": { "version": "6.13.0", - "resolved": "http://registry.npmjs.org/babel-plugin-syntax-object-rest-spread/-/babel-plugin-syntax-object-rest-spread-6.13.0.tgz", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-object-rest-spread/-/babel-plugin-syntax-object-rest-spread-6.13.0.tgz", "integrity": "sha1-/WU28rzhODb/o6VFjEkDpZe7O/U=", "dev": true }, - "babel-plugin-transform-es2015-modules-commonjs": { - "version": "6.26.2", - "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-commonjs/-/babel-plugin-transform-es2015-modules-commonjs-6.26.2.tgz", - "integrity": "sha512-CV9ROOHEdrjcwhIaJNBGMBCodN+1cfkwtM1SbUHmvyy35KGT7fohbpOxkE2uLz1o6odKK2Ck/tz47z+VqQfi9Q==", - "dev": true, - "requires": { - "babel-plugin-transform-strict-mode": "^6.24.1", - "babel-runtime": "^6.26.0", - "babel-template": "^6.26.0", - "babel-types": "^6.26.0" - } - }, - "babel-plugin-transform-strict-mode": { - "version": "6.24.1", - "resolved": "https://registry.npmjs.org/babel-plugin-transform-strict-mode/-/babel-plugin-transform-strict-mode-6.24.1.tgz", - "integrity": "sha1-1fr3qleKZbvlkc9e2uBKDGcCB1g=", - "dev": true, - "requires": { - "babel-runtime": "^6.22.0", - "babel-types": "^6.24.1" - } - }, "babel-preset-jest": { "version": "23.2.0", "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-23.2.0.tgz", @@ -2360,18 +2441,6 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -2390,12 +2459,6 @@ "tweetnacl": "^0.14.3" } }, - "beeper": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/beeper/-/beeper-1.1.1.tgz", - "integrity": "sha1-5tXqjF2tABMEpwsiY4RH9pyy+Ak=", - "dev": true - }, "benchmark": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz", @@ -2407,11 +2470,24 @@ } }, "big.js": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/big.js/-/big.js-3.2.0.tgz", - "integrity": "sha512-+hN/Zh2D08Mx65pZ/4g5bsmNiZUuChDiQfTUQ7qJr4/kuopCr88xZsAXv6mBoZEsUI4OuGHlX59qE94K2mMW8Q==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/big.js/-/big.js-5.2.2.tgz", + "integrity": "sha512-vyL2OymJxmarO8gxMr0mhChsO9QGwhynfuu4+MHTAW6czfq9humCB7rKpUjDd9YUiDPU4mzpyupFSvOClAwbmQ==", "dev": true }, + "bin-links": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/bin-links/-/bin-links-1.1.2.tgz", + "integrity": "sha512-8eEHVgYP03nILphilltWjeIjMbKyJo3wvp9K816pHbhP301ismzw15mxAAEVQ/USUwcP++1uNrbERbp8lOA6Fg==", + "dev": true, + "requires": { + "bluebird": "^3.5.0", + "cmd-shim": "^2.0.2", + "gentle-fs": "^2.0.0", + "graceful-fs": "^4.1.11", + "write-file-atomic": "^2.3.0" + } + }, "binary-extensions": { "version": "1.12.0", "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-1.12.0.tgz", @@ -2428,9 +2504,9 @@ } }, "bluebird": { - "version": "3.5.2", - 
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.2.tgz", - "integrity": "sha512-dhHTWMI7kMx5whMQntl7Vr9C6BvV10lFXDAasnqnrMYhXVCzzk6IO9Fo2L75jXHT07WrOngL1WDXOp+yYS91Yg==", + "version": "3.5.3", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", + "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==", "dev": true }, "bn.js": { @@ -2450,14 +2526,32 @@ } }, "braces": { - "version": "1.8.5", - "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", - "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", + "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", "dev": true, "requires": { - "expand-range": "^1.8.1", - "preserve": "^0.2.0", - "repeat-element": "^1.1.2" + "arr-flatten": "^1.1.0", + "array-unique": "^0.3.2", + "extend-shallow": "^2.0.1", + "fill-range": "^4.0.0", + "isobject": "^3.0.1", + "repeat-element": "^1.1.2", + "snapdragon": "^0.8.1", + "snapdragon-node": "^2.0.1", + "split-string": "^3.0.2", + "to-regex": "^3.0.1" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } } }, "brorand": { @@ -2491,7 +2585,7 @@ }, "browserify-aes": { "version": "1.2.0", - "resolved": "http://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", + "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", "integrity": "sha512-+7CHXqGuspUn/Sl5aO7Ea0xWGAtETPXNSAjHo48JfLdPWcMng33Xe4znFvQweqc/uzk5zSOI3H52CYnjCfb5hA==", "dev": true, "requires": { @@ -2528,7 +2622,7 @@ }, "browserify-rsa": { "version": "4.0.1", - "resolved": "http://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", + "resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", "integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=", "dev": true, "requires": { @@ -2560,6 +2654,15 @@ "pako": "~1.0.5" } }, + "bs-logger": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/bs-logger/-/bs-logger-0.2.6.tgz", + "integrity": "sha512-pd8DCoxmbgc7hyPKOvxtqNcjYoOsABPQdcCUjGp3d42VR2CX1ORhk2A87oqqu5R1kk+76nsxZupkmyd+MVtCog==", + "dev": true, + "requires": { + "fast-json-stable-stringify": "2.x" + } + }, "bser": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/bser/-/bser-2.0.0.tgz", @@ -2571,7 +2674,7 @@ }, "buffer": { "version": "4.9.1", - "resolved": "http://registry.npmjs.org/buffer/-/buffer-4.9.1.tgz", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-4.9.1.tgz", "integrity": "sha1-bRu2AbB6TvztlwlBMgkwJ8lbwpg=", "dev": true, "requires": { @@ -2629,32 +2732,47 @@ "dev": true }, "cacache": { - "version": "11.2.0", - "resolved": "https://registry.npmjs.org/cacache/-/cacache-11.2.0.tgz", - "integrity": "sha512-IFWl6lfK6wSeYCHUXh+N1lY72UDrpyrYQJNIVQf48paDuWbv5RbAtJYf/4gUQFObTCHZwdZ5sI8Iw7nqwP6nlQ==", + "version": "11.3.2", + "resolved": "https://registry.npmjs.org/cacache/-/cacache-11.3.2.tgz", + "integrity": "sha512-E0zP4EPGDOaT2chM08Als91eYnf8Z+eH1awwwVsngUmgppfM5jjJ8l3z5vO5p5w/I3LsiXawb1sW0VY65pQABg==", "dev": true, "requires": { - "bluebird": "^3.5.1", - "chownr": "^1.0.1", - "figgy-pudding": "^3.1.0", - "glob": "^7.1.2", - "graceful-fs": "^4.1.11", - "lru-cache": "^4.1.3", + "bluebird": 
"^3.5.3", + "chownr": "^1.1.1", + "figgy-pudding": "^3.5.1", + "glob": "^7.1.3", + "graceful-fs": "^4.1.15", + "lru-cache": "^5.1.1", "mississippi": "^3.0.0", "mkdirp": "^0.5.1", "move-concurrently": "^1.0.1", "promise-inflight": "^1.0.1", "rimraf": "^2.6.2", - "ssri": "^6.0.0", - "unique-filename": "^1.1.0", + "ssri": "^6.0.1", + "unique-filename": "^1.1.1", "y18n": "^4.0.0" }, "dependencies": { + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, "y18n": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", "integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==", "dev": true + }, + "yallist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", + "dev": true } } }, @@ -2673,14 +2791,6 @@ "to-object-path": "^0.3.0", "union-value": "^1.0.0", "unset-value": "^1.0.0" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "call-me-maybe": { @@ -2689,6 +2799,24 @@ "integrity": "sha1-JtII6onje1y95gJQoV8DHBak1ms=", "dev": true }, + "caller-callsite": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/caller-callsite/-/caller-callsite-2.0.0.tgz", + "integrity": "sha1-hH4PzgoiN1CpoCfFSzNzGtMVQTQ=", + "dev": true, + "requires": { + "callsites": "^2.0.0" + } + }, + "caller-path": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/caller-path/-/caller-path-2.0.0.tgz", + "integrity": "sha1-Ro+DBE42mrIBD6xfBs7uFbsssfQ=", + "dev": true, + "requires": { + "caller-callsite": "^2.0.0" + } + }, "callsites": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-2.0.0.tgz", @@ -2736,9 +2864,9 @@ "dev": true }, "chalk": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", - "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", + "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==", "requires": { "ansi-styles": "^3.2.1", "escape-string-regexp": "^1.0.5", @@ -2752,20 +2880,24 @@ "dev": true }, "chokidar": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", - "integrity": "sha1-eY5ol3gVHIB2tLNg5e3SjNortGg=", + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", + "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", "dev": true, "requires": { - "anymatch": "^1.3.0", + "anymatch": "^2.0.0", "async-each": "^1.0.0", - "fsevents": "^1.0.0", - "glob-parent": "^2.0.0", + "braces": "^2.3.0", + "fsevents": "^1.2.2", + "glob-parent": "^3.1.0", "inherits": "^2.0.1", "is-binary-path": "^1.0.0", - "is-glob": "^2.0.0", + "is-glob": "^4.0.0", + "lodash.debounce": "^4.0.8", + "normalize-path": "^2.1.1", "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0" + 
"readdirp": "^2.0.0", + "upath": "^1.0.5" } }, "chownr": { @@ -2819,12 +2951,6 @@ "requires": { "is-descriptor": "^0.1.0" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true } } }, @@ -2837,16 +2963,6 @@ "restore-cursor": "^2.0.0" } }, - "cli-truncate": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-0.2.1.tgz", - "integrity": "sha1-nxXPuwcFAFNpIWxiasfQWrkN1XQ=", - "dev": true, - "requires": { - "slice-ansi": "0.0.4", - "string-width": "^1.0.1" - } - }, "cli-width": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/cli-width/-/cli-width-2.2.0.tgz", @@ -2891,6 +3007,14 @@ "inherits": "^2.0.1", "process-nextick-args": "^2.0.0", "readable-stream": "^2.3.5" + }, + "dependencies": { + "process-nextick-args": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", + "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "dev": true + } } }, "cmd-shim": { @@ -2924,17 +3048,6 @@ "arr-map": "^2.0.2", "for-own": "^1.0.0", "make-iterator": "^1.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - } } }, "collection-visit": { @@ -3127,11 +3240,14 @@ "through2": "^2.0.0" }, "dependencies": { - "dateformat": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", - "integrity": "sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", - "dev": true + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } }, "load-json-file": { "version": "4.0.0", @@ -3215,14 +3331,6 @@ "semver": "^5.5.0", "split": "^1.0.0", "through2": "^2.0.0" - }, - "dependencies": { - "dateformat": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", - "integrity": "sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", - "dev": true - } } }, "conventional-commits-filter": { @@ -3306,9 +3414,9 @@ } }, "core-js": { - "version": "2.5.7", - "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.5.7.tgz", - "integrity": "sha512-RszJCAxg/PP6uzXVXL6BsxSXx/B05oJAQ2vkJRjyjrEcNVycaqOmNb5OTxZPE3xa5gwZduqza6L9JOCenh/Ecw==", + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.6.3.tgz", + "integrity": "sha512-l00tmFFZOBHtYhN4Cz7k32VM7vTn3rE2ANjQDxdEN6zmXZ/xq1jQuutnmHvMG1ZJ7xd72+TA5YpUK8wz3rWsfQ==", "dev": true }, "core-util-is": { @@ -3318,11 +3426,12 @@ "dev": true }, "cosmiconfig": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-5.0.6.tgz", - "integrity": "sha512-6DWfizHriCrFWURP1/qyhsiFvYdlJzbCzmtFWh744+KyWsJo5+kPzUZZaMRSSItoYc0pxFX7gEO7ZC1/gN/7AQ==", + "version": "5.0.7", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-5.0.7.tgz", + "integrity": "sha512-PcLqxTKiDmNT6pSpy4N6KtuPwb53W+2tzNvwOZw0WH9N6O0vLIBq0x8aj8Oj75ere4YcGi48bDFCL+3fRJdlNA==", "dev": true, "requires": { + "import-fresh": "^2.0.0", 
"is-directory": "^0.3.1", "js-yaml": "^3.9.0", "parse-json": "^4.0.0" @@ -3354,25 +3463,6 @@ "request": "^2.85.0" } }, - "cpx": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/cpx/-/cpx-1.5.0.tgz", - "integrity": "sha1-GFvgGFEdhycN7czCkxceN2VauI8=", - "dev": true, - "requires": { - "babel-runtime": "^6.9.2", - "chokidar": "^1.6.0", - "duplexer": "^0.1.1", - "glob": "^7.0.5", - "glob2base": "^0.0.12", - "minimatch": "^3.0.2", - "mkdirp": "^0.5.1", - "resolve": "^1.1.7", - "safe-buffer": "^5.0.1", - "shell-quote": "^1.6.1", - "subarg": "^1.0.0" - } - }, "create-ecdh": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/create-ecdh/-/create-ecdh-4.0.3.tgz", @@ -3385,7 +3475,7 @@ }, "create-hash": { "version": "1.2.0", - "resolved": "http://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", + "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", "integrity": "sha512-z00bCGNHDG8mHAkP7CtT1qVu+bFQUPjYq/4Iv3C3kWjTFV10zIjfSoeqXo9Asws8gwSHDGj/hl2u4OGIjapeCg==", "dev": true, "requires": { @@ -3398,7 +3488,7 @@ }, "create-hmac": { "version": "1.1.7", - "resolved": "http://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", + "resolved": "https://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", "integrity": "sha512-MJG9liiZ+ogc4TzUwuvbER1JRdgvUFSB5+VR/g5h82fGaIRWMWddtKBHi7/sVhfjQZ6SehlyhvQYrcYkaUIpLg==", "dev": true, "requires": { @@ -3423,7 +3513,7 @@ }, "cross-spawn-async": { "version": "2.2.5", - "resolved": "http://registry.npmjs.org/cross-spawn-async/-/cross-spawn-async-2.2.5.tgz", + "resolved": "https://registry.npmjs.org/cross-spawn-async/-/cross-spawn-async-2.2.5.tgz", "integrity": "sha1-hF/wwINKPe2dFg2sptOQkGuyiMw=", "dev": true, "requires": { @@ -3502,7 +3592,7 @@ }, "d": { "version": "1.0.0", - "resolved": "http://registry.npmjs.org/d/-/d-1.0.0.tgz", + "resolved": "https://registry.npmjs.org/d/-/d-1.0.0.tgz", "integrity": "sha1-dUu1v+VUUdpppYuU1F9MWwRi1Y8=", "dev": true, "requires": { @@ -3551,12 +3641,6 @@ } } }, - "date-fns": { - "version": "1.29.0", - "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-1.29.0.tgz", - "integrity": "sha512-lbTXWZ6M20cWH8N9S6afb0SBm6tMk+uUg6z3MqHPKE9atmsY3kJkTm8vKe93izJ2B2+q5MV990sM2CHgtAZaOw==", - "dev": true - }, "date-now": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/date-now/-/date-now-0.1.4.tgz", @@ -3564,9 +3648,9 @@ "dev": true }, "dateformat": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-2.2.0.tgz", - "integrity": "sha1-QGXiATz5+5Ft39gu+1Bq1MZ2kGI=", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", + "integrity": "sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", "dev": true }, "debug": { @@ -3755,18 +3839,6 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -3845,7 +3917,7 @@ }, "diffie-hellman": { "version": "5.0.3", - "resolved": "http://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", + "resolved": 
"https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", "integrity": "sha512-kqag/Nl+f3GwyK25fhUMYj81BUOrZ9IuJsjIcDE5icNM9FJHAVm3VcUDxdLPoQtTuUylWm6ZIknYJwwaPxsUzg==", "dev": true, "requires": { @@ -3901,45 +3973,10 @@ }, "duplexer": { "version": "0.1.1", - "resolved": "http://registry.npmjs.org/duplexer/-/duplexer-0.1.1.tgz", + "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.1.tgz", "integrity": "sha1-rOb/gIwc5mtX0ev5eXessCM0z8E=", "dev": true }, - "duplexer2": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.0.2.tgz", - "integrity": "sha1-xhTc9n4vsUmVqRcR5aYX6KYKMds=", - "dev": true, - "requires": { - "readable-stream": "~1.1.9" - }, - "dependencies": { - "isarray": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", - "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", - "dev": true - }, - "readable-stream": { - "version": "1.1.14", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz", - "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=", - "dev": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.1", - "isarray": "0.0.1", - "string_decoder": "~0.10.x" - } - }, - "string_decoder": { - "version": "0.10.31", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", - "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", - "dev": true - } - } - }, "duplexify": { "version": "3.6.1", "resolved": "https://registry.npmjs.org/duplexify/-/duplexify-3.6.1.tgz", @@ -3972,12 +4009,6 @@ "safer-buffer": "^2.1.0" } }, - "elegant-spinner": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/elegant-spinner/-/elegant-spinner-1.0.1.tgz", - "integrity": "sha1-2wQ1IcldfjA/2PNFvtwzSc+wcp4=", - "dev": true - }, "elliptic": { "version": "6.4.1", "resolved": "https://registry.npmjs.org/elliptic/-/elliptic-6.4.1.tgz", @@ -4053,16 +4084,17 @@ } }, "es-abstract": { - "version": "1.12.0", - "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.12.0.tgz", - "integrity": "sha512-C8Fx/0jFmV5IPoMOFPA9P9G5NtqW+4cOPit3MIuvR2t7Ag2K15EJTpxnHAYTzL+aYQJIESYeXZmDBfOBE1HcpA==", + "version": "1.13.0", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz", + "integrity": "sha512-vDZfg/ykNxQVwup/8E1BZhVzFfBxs9NqMzGcvIJrqg5k2/5Za2bWo40dK2J1pgLngZ7c+Shh8lwYtLGyrwPutg==", "dev": true, "requires": { - "es-to-primitive": "^1.1.1", + "es-to-primitive": "^1.2.0", "function-bind": "^1.1.1", - "has": "^1.0.1", - "is-callable": "^1.1.3", - "is-regex": "^1.0.4" + "has": "^1.0.3", + "is-callable": "^1.1.4", + "is-regex": "^1.0.4", + "object-keys": "^1.0.12" } }, "es-to-primitive": { @@ -4077,9 +4109,9 @@ } }, "es5-ext": { - "version": "0.10.46", - "resolved": "https://registry.npmjs.org/es5-ext/-/es5-ext-0.10.46.tgz", - "integrity": "sha512-24XxRvJXNFwEMpJb3nOkiRJKRoupmjYmOPVlI65Qy2SrtxwOTB+g6ODjBKOtwEHbYrhWRty9xxOWLNdClT2djw==", + "version": "0.10.47", + "resolved": "https://registry.npmjs.org/es5-ext/-/es5-ext-0.10.47.tgz", + "integrity": "sha512-/1TItLfj+TTfWoeRcDn/0FbGV6SNo4R+On2GGVucPU/j3BWnXE2Co8h8CTo4Tu34gFJtnmwS9xiScKs4EjZhdw==", "dev": true, "requires": { "es6-iterator": "~2.0.3", @@ -4112,7 +4144,7 @@ }, "es6-promisify": { "version": "5.0.0", - "resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "dev": true, "requires": { @@ 
-4190,6 +4222,12 @@ "estraverse": "^4.1.1" } }, + "esm": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/esm/-/esm-3.1.4.tgz", + "integrity": "sha512-GScwIz0110RTNzBmAQEdqaAYkD9zVhj2Jo+jeizjIcdyTw+C6S0Zv/dlPYgfF41hRTu2f1vQYliubzIkusx2gA==", + "dev": true + }, "esprima": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", @@ -4228,9 +4266,9 @@ } }, "events": { - "version": "1.1.1", - "resolved": "http://registry.npmjs.org/events/-/events-1.1.1.tgz", - "integrity": "sha1-nr23Y1rQmccNzEwqH1AEKI6L2SQ=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.0.0.tgz", + "integrity": "sha512-Dc381HFWJzEOhQ+d8pkNon++bk9h6cdAoAj4iE6Q4y6xgTzySWXlKn05/TVNpjnfRqi/X0EpJEJohPjNI3zpVA==", "dev": true }, "evp_bytestokey": { @@ -4273,19 +4311,39 @@ "integrity": "sha1-BjJjj42HfMghB9MKD/8aF8uhzQw=", "dev": true }, - "exit-hook": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/exit-hook/-/exit-hook-1.1.1.tgz", - "integrity": "sha1-8FyiM7SMBdVP/wd2XfhQfpXAL/g=", - "dev": true - }, "expand-brackets": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", - "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", + "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", "dev": true, "requires": { - "is-posix-bracket": "^0.1.0" + "debug": "^2.3.3", + "define-property": "^0.2.5", + "extend-shallow": "^2.0.1", + "posix-character-classes": "^0.1.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + }, + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } } }, "expand-range": { @@ -4295,6 +4353,48 @@ "dev": true, "requires": { "fill-range": "^2.1.0" + }, + "dependencies": { + "fill-range": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.4.tgz", + "integrity": "sha512-cnrcCbj01+j2gTG921VZPnHbjmdAf8oQV/iGeV2kZxGSyfYjjTyY79ErsK1WJWMpw6DaApEX72binqJE+/d+5Q==", + "dev": true, + "requires": { + "is-number": "^2.1.0", + "isobject": "^2.0.0", + "randomatic": "^3.0.0", + "repeat-element": "^1.1.2", + "repeat-string": "^1.5.2" + } + }, + "is-number": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", + "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", + "dev": true, + "requires": { + "kind-of": "^3.0.2" + } + }, + "isobject": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", + "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", + "dev": true, + "requires": { + "isarray": "1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "expand-tilde": { @@ -4359,262 +4459,37 @@ } }, "extglob": { - "version": "0.3.2", - "resolved": 
"https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", - "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", - "dev": true, - "requires": { - "is-extglob": "^1.0.0" - } - }, - "extsprintf": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", - "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=", - "dev": true - }, - "fancy-log": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/fancy-log/-/fancy-log-1.3.2.tgz", - "integrity": "sha1-9BEl49hPLn2JpD0G2VjI94vha+E=", - "dev": true, - "requires": { - "ansi-gray": "^0.1.1", - "color-support": "^1.1.3", - "time-stamp": "^1.0.0" - } - }, - "fast-deep-equal": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", - "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=", - "dev": true - }, - "fast-glob": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-2.2.4.tgz", - "integrity": "sha512-FjK2nCGI/McyzgNtTESqaWP3trPvHyRyoyY70hxjc3oKPNmDe8taohLZpoVKoUjW85tbU5txaYUZCNtVzygl1g==", + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", + "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", "dev": true, "requires": { - "@mrmlnc/readdir-enhanced": "^2.2.1", - "@nodelib/fs.stat": "^1.1.2", - "glob-parent": "^3.1.0", - "is-glob": "^4.0.0", - "merge2": "^1.2.3", - "micromatch": "^3.1.10" + "array-unique": "^0.3.2", + "define-property": "^1.0.0", + "expand-brackets": "^2.1.4", + "extend-shallow": "^2.0.1", + "fragment-cache": "^0.2.1", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" }, "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - 
"requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", + "define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", "dev": true, "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } + "is-descriptor": "^1.0.0" } }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", "dev": true, "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } + "is-extendable": "^0.1.0" } }, "is-accessor-descriptor": { @@ -4645,77 +4520,53 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } } } }, + "extsprintf": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", + "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=", + "dev": true + }, + "fancy-log": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/fancy-log/-/fancy-log-1.3.3.tgz", + "integrity": "sha512-k9oEhlyc0FrVh25qYuSELjr8oxsCoc4/LEZfg2iJJrfEk/tZL9bCoJE47gqAvI2m/AUjluCS4+3I0eTx8n3AEw==", + "dev": true, + "requires": { + "ansi-gray": "^0.1.1", + "color-support": "^1.1.3", + "parse-node-version": "^1.0.0", + "time-stamp": "^1.0.0" + } + }, + 
"fast-deep-equal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", + "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", + "dev": true + }, + "fast-extend": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/fast-extend/-/fast-extend-0.0.2.tgz", + "integrity": "sha1-9exCz0C5Rg9SGmOH37Ut7u1nHb0=", + "dev": true + }, + "fast-glob": { + "version": "2.2.6", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-2.2.6.tgz", + "integrity": "sha512-0BvMaZc1k9F+MeWWMe8pL6YltFzZYcJsYU7D4JyDA6PAczaXvxqQQ/z+mDF7/4Mw01DeUc+i3CTKajnkANkV4w==", + "dev": true, + "requires": { + "@mrmlnc/readdir-enhanced": "^2.2.1", + "@nodelib/fs.stat": "^1.1.2", + "glob-parent": "^3.1.0", + "is-glob": "^4.0.0", + "merge2": "^1.2.3", + "micromatch": "^3.1.10" + } + }, "fast-json-stable-stringify": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.0.0.tgz", @@ -4769,16 +4620,26 @@ } }, "fill-range": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.4.tgz", - "integrity": "sha512-cnrcCbj01+j2gTG921VZPnHbjmdAf8oQV/iGeV2kZxGSyfYjjTyY79ErsK1WJWMpw6DaApEX72binqJE+/d+5Q==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", + "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", "dev": true, "requires": { - "is-number": "^2.1.0", - "isobject": "^2.0.0", - "randomatic": "^3.0.0", - "repeat-element": "^1.1.2", - "repeat-string": "^1.5.2" + "extend-shallow": "^2.0.1", + "is-number": "^3.0.0", + "repeat-string": "^1.6.1", + "to-regex-range": "^2.1.0" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } } }, "find-cache-dir": { @@ -4812,9 +4673,9 @@ } }, "p-limit": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.0.0.tgz", - "integrity": "sha512-fl5s52lI5ahKCernzzIyAP0QAZbGIovtVHGwpcu1Jr/EpzLVDI2myISHwGqK7m8uQFugVWSrbxH7XnhGtvEc+A==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.1.0.tgz", + "integrity": "sha512-NhURkNcrVB+8hNfLuysU8enY5xn2KXphsHBaC2YmRNTZRc7RWusw6apSpdEj3jo4CMb6W9nrF6tTnsJsJeyu6g==", "dev": true, "requires": { "p-try": "^2.0.0" @@ -4835,6 +4696,12 @@ "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", "dev": true }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + }, "pkg-dir": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-3.0.0.tgz", @@ -4846,16 +4713,10 @@ } } }, - "find-index": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/find-index/-/find-index-0.1.1.tgz", - "integrity": "sha1-Z101iyyjiS15Whq0cjL4tuLg3eQ=", - "dev": true - }, - "find-parent-dir": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/find-parent-dir/-/find-parent-dir-0.3.0.tgz", - "integrity": "sha1-M8RLQpqysvBkYpnF+fcY83b/jVQ=", + "find-npm-prefix": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/find-npm-prefix/-/find-npm-prefix-1.0.2.tgz", + "integrity": 
"sha512-KEftzJ+H90x6pcKtdXZEPsQse8/y/UnvzRKrOSQFprnrGaFuJ62fVkP34Iu2IYuMvyauCyoLTNkJZgrrGA2wkA==", "dev": true }, "find-replace": { @@ -4868,12 +4729,13 @@ } }, "find-up": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", - "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-1.1.2.tgz", + "integrity": "sha1-ay6YIrGizgpgq2TWEOzK1TyyTQ8=", "dev": true, "requires": { - "locate-path": "^2.0.0" + "path-exists": "^2.0.0", + "pinkie-promise": "^2.0.0" } }, "findup-sync": { @@ -4888,233 +4750,6 @@ "resolve-dir": "^1.0.1" }, "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" 
- }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-extglob": { - "version": 
"2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, "is-glob": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", @@ -5123,66 +4758,13 @@ "requires": { "is-extglob": "^2.1.0" } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } } } }, "fined": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/fined/-/fined-1.1.0.tgz", - "integrity": "sha1-s33IRLdqL15wgeiE98CuNE8VNHY=", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/fined/-/fined-1.1.1.tgz", + "integrity": "sha512-jQp949ZmEbiYHk3gkbdtpJ0G1+kgtLQBNdP5edFP7Fh+WAYceLQz6yO1SBj72Xkg8GVyTB3bBzAYrHJVh5Xd5g==", "dev": true, "requires": { "expand-tilde": "^2.0.2", @@ -5193,9 +4775,9 @@ } }, "flagged-respawn": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/flagged-respawn/-/flagged-respawn-1.0.0.tgz", - "integrity": "sha1-Tnmumy6zi/hrO7Vr8+ClaqX8q9c=", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/flagged-respawn/-/flagged-respawn-1.0.1.tgz", + "integrity": "sha512-lNaHNVymajmk0OJMBn8fVUAU1BtDeKIqKoVhk4xAALB57aALg6b4W0MfJ/cUE0g9YBXy5XhSlPIpYIJ7HaY/3Q==", "dev": true }, "flatbuffers": { @@ -5220,9 +4802,9 @@ "dev": true }, "for-own": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", - "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", + "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", "dev": true, "requires": { "for-in": "^1.0.1" @@ -5294,6 +4876,23 @@ "through2": "^2.0.3" } }, + "fs-monkey": { + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/fs-monkey/-/fs-monkey-0.3.3.tgz", + "integrity": "sha512-FNUvuTAJ3CqCQb5ELn+qCbGR/Zllhf2HtwsdAtBi59s1WeCjKMT81fHcSu7dwIskqGVK+MmOrb7VOBlq3/SItw==", + "dev": true + }, + "fs-vacuum": { + "version": "1.2.10", + "resolved": "https://registry.npmjs.org/fs-vacuum/-/fs-vacuum-1.2.10.tgz", + "integrity": 
"sha1-t2Kb7AekAxolSP35n17PHMizHjY=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "path-is-inside": "^1.0.1", + "rimraf": "^2.5.2" + } + }, "fs-write-stream-atomic": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/fs-write-stream-atomic/-/fs-write-stream-atomic-1.0.10.tgz", @@ -5313,9 +4912,9 @@ "dev": true }, "fsevents": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-1.2.4.tgz", - "integrity": "sha512-z8H8/diyk76B7q5wg+Ud0+CqzcAF3mBBI/bA5ne5zrRUUIvNkJY//D3BqyH571KuAC4Nr7Rw7CjWX4r0y9DvNg==", + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-1.2.7.tgz", + "integrity": "sha512-Pxm6sI2MeBD7RdD12RYsqaP0nMiwx8eZBXCa6z2L+mRHm2DYrOYwihmhjpkdjUHwQhslWQjRpEgNq4XvBmaAuw==", "dev": true, "optional": true, "requires": { @@ -5341,7 +4940,7 @@ "optional": true }, "are-we-there-yet": { - "version": "1.1.4", + "version": "1.1.5", "bundled": true, "dev": true, "optional": true, @@ -5365,7 +4964,7 @@ } }, "chownr": { - "version": "1.0.1", + "version": "1.1.1", "bundled": true, "dev": true, "optional": true @@ -5401,7 +5000,7 @@ } }, "deep-extend": { - "version": "0.5.1", + "version": "0.6.0", "bundled": true, "dev": true, "optional": true @@ -5450,7 +5049,7 @@ } }, "glob": { - "version": "7.1.2", + "version": "7.1.3", "bundled": true, "dev": true, "optional": true, @@ -5470,12 +5069,12 @@ "optional": true }, "iconv-lite": { - "version": "0.4.21", + "version": "0.4.24", "bundled": true, "dev": true, "optional": true, "requires": { - "safer-buffer": "^2.1.0" + "safer-buffer": ">= 2.1.2 < 3" } }, "ignore-walk": { @@ -5536,16 +5135,16 @@ "dev": true }, "minipass": { - "version": "2.2.4", + "version": "2.3.5", "bundled": true, "dev": true, "requires": { - "safe-buffer": "^5.1.1", + "safe-buffer": "^5.1.2", "yallist": "^3.0.0" } }, "minizlib": { - "version": "1.1.0", + "version": "1.2.1", "bundled": true, "dev": true, "optional": true, @@ -5568,7 +5167,7 @@ "optional": true }, "needle": { - "version": "2.2.0", + "version": "2.2.4", "bundled": true, "dev": true, "optional": true, @@ -5579,18 +5178,18 @@ } }, "node-pre-gyp": { - "version": "0.10.0", + "version": "0.10.3", "bundled": true, "dev": true, "optional": true, "requires": { "detect-libc": "^1.0.2", "mkdirp": "^0.5.1", - "needle": "^2.2.0", + "needle": "^2.2.1", "nopt": "^4.0.1", "npm-packlist": "^1.1.6", "npmlog": "^4.0.2", - "rc": "^1.1.7", + "rc": "^1.2.7", "rimraf": "^2.6.1", "semver": "^5.3.0", "tar": "^4" @@ -5607,13 +5206,13 @@ } }, "npm-bundled": { - "version": "1.0.3", + "version": "1.0.5", "bundled": true, "dev": true, "optional": true }, "npm-packlist": { - "version": "1.1.10", + "version": "1.2.0", "bundled": true, "dev": true, "optional": true, @@ -5688,12 +5287,12 @@ "optional": true }, "rc": { - "version": "1.2.7", + "version": "1.2.8", "bundled": true, "dev": true, "optional": true, "requires": { - "deep-extend": "^0.5.1", + "deep-extend": "^0.6.0", "ini": "~1.3.0", "minimist": "^1.2.0", "strip-json-comments": "~2.0.1" @@ -5723,16 +5322,16 @@ } }, "rimraf": { - "version": "2.6.2", + "version": "2.6.3", "bundled": true, "dev": true, "optional": true, "requires": { - "glob": "^7.0.5" + "glob": "^7.1.3" } }, "safe-buffer": { - "version": "5.1.1", + "version": "5.1.2", "bundled": true, "dev": true }, @@ -5749,7 +5348,7 @@ "optional": true }, "semver": { - "version": "5.5.0", + "version": "5.6.0", "bundled": true, "dev": true, "optional": true @@ -5800,17 +5399,17 @@ "optional": true }, "tar": { - "version": "4.4.1", + "version": "4.4.8", 
"bundled": true, "dev": true, "optional": true, "requires": { - "chownr": "^1.0.1", + "chownr": "^1.1.1", "fs-minipass": "^1.2.5", - "minipass": "^2.2.4", - "minizlib": "^1.1.0", + "minipass": "^2.3.4", + "minizlib": "^1.1.1", "mkdirp": "^0.5.0", - "safe-buffer": "^5.1.1", + "safe-buffer": "^5.1.2", "yallist": "^3.0.2" } }, @@ -5821,12 +5420,12 @@ "optional": true }, "wide-align": { - "version": "1.1.2", + "version": "1.1.3", "bundled": true, "dev": true, "optional": true, "requires": { - "string-width": "^1.0.2" + "string-width": "^1.0.2 || 2" } }, "wrappy": { @@ -5835,7 +5434,7 @@ "dev": true }, "yallist": { - "version": "3.0.2", + "version": "3.0.3", "bundled": true, "dev": true } @@ -5881,16 +5480,26 @@ "integrity": "sha512-KGDOARWVga7+rnB3z9Sd2Letx515owfk0hSxHGuqjANb1M+x2bGZGqHLiozPsYMdM2OubeMni/Hpwmjq6qIUhA==", "dev": true }, - "get-caller-file": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.3.tgz", - "integrity": "sha512-3t6rVToeoZfYSGd8YoLFR2DJkiQrIiUrGcjvFX2mDw3bn6k2OtwHN0TNCLbBO+w8qTvimhDkv+LSscbJY1vE6w==", - "dev": true - }, - "get-own-enumerable-property-symbols": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/get-own-enumerable-property-symbols/-/get-own-enumerable-property-symbols-3.0.0.tgz", - "integrity": "sha512-CIJYJC4GGF06TakLg8z4GQKvDsx9EMspVxOYih7LerEL/WosUnFIww45CGfxfeKHqlg3twgUrYRT1O3WQqjGCg==", + "gentle-fs": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/gentle-fs/-/gentle-fs-2.0.1.tgz", + "integrity": "sha512-cEng5+3fuARewXktTEGbwsktcldA+YsnUEaXZwcK/3pjSE1X9ObnTs+/8rYf8s+RnIcQm2D5x3rwpN7Zom8Bew==", + "dev": true, + "requires": { + "aproba": "^1.1.2", + "fs-vacuum": "^1.2.10", + "graceful-fs": "^4.1.11", + "iferr": "^0.1.5", + "mkdirp": "^0.5.1", + "path-is-inside": "^1.0.2", + "read-cmd-shim": "^1.0.1", + "slide": "^1.1.6" + } + }, + "get-caller-file": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.3.tgz", + "integrity": "sha512-3t6rVToeoZfYSGd8YoLFR2DJkiQrIiUrGcjvFX2mDw3bn6k2OtwHN0TNCLbBO+w8qTvimhDkv+LSscbJY1vE6w==", "dev": true }, "get-pkg-repo": { @@ -5914,7 +5523,7 @@ }, "camelcase-keys": { "version": "2.1.0", - "resolved": "http://registry.npmjs.org/camelcase-keys/-/camelcase-keys-2.1.0.tgz", + "resolved": "https://registry.npmjs.org/camelcase-keys/-/camelcase-keys-2.1.0.tgz", "integrity": "sha1-MIvur/3ygRkFHvodkyITyRuPkuc=", "dev": true, "requires": { @@ -5939,7 +5548,7 @@ }, "meow": { "version": "3.7.0", - "resolved": "http://registry.npmjs.org/meow/-/meow-3.7.0.tgz", + "resolved": "https://registry.npmjs.org/meow/-/meow-3.7.0.tgz", "integrity": "sha1-cstmi0JSKCkKu/qFaJJYcwioAfs=", "dev": true, "requires": { @@ -5996,7 +5605,7 @@ }, "get-stream": { "version": "3.0.0", - "resolved": "http://registry.npmjs.org/get-stream/-/get-stream-3.0.0.tgz", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-3.0.0.tgz", "integrity": "sha1-jpQ9E1jcN1VQVOy+LtsFqhdO3hQ=", "dev": true }, @@ -6026,27 +5635,6 @@ "meow": "^4.0.0", "split2": "^2.0.0", "through2": "^2.0.0" - }, - "dependencies": { - "lodash.template": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-4.4.0.tgz", - "integrity": "sha1-5zoDhcg1VZF0bgILmWecaQ5o+6A=", - "dev": true, - "requires": { - "lodash._reinterpolate": "~3.0.0", - "lodash.templatesettings": "^4.0.0" - } - }, - "lodash.templatesettings": { - "version": "4.1.0", - "resolved": 
"https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-4.1.0.tgz", - "integrity": "sha1-K01OlbpEDZFf8IvImeRVNmZxMxY=", - "dev": true, - "requires": { - "lodash._reinterpolate": "~3.0.0" - } - } } }, "git-remote-origin-url": { @@ -6061,7 +5649,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } @@ -6108,15 +5696,53 @@ "requires": { "glob-parent": "^2.0.0", "is-glob": "^2.0.0" + }, + "dependencies": { + "glob-parent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", + "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", + "dev": true, + "requires": { + "is-glob": "^2.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + } } }, "glob-parent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", - "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", + "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", "dev": true, "requires": { - "is-glob": "^2.0.0" + "is-glob": "^3.1.0", + "path-dirname": "^1.0.0" + }, + "dependencies": { + "is-glob": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", + "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", + "dev": true, + "requires": { + "is-extglob": "^2.1.0" + } + } } }, "glob-stream": { @@ -6135,33 +5761,6 @@ "remove-trailing-separator": "^1.0.1", "to-absolute-glob": "^2.0.0", "unique-stream": "^2.0.2" - }, - "dependencies": { - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } } }, "glob-to-regexp": { @@ -6182,358 +5781,6 @@ "is-negated-glob": "^1.0.0", "just-debounce": "^1.0.0", "object.defaults": "^1.1.0" - }, - "dependencies": { - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": 
"https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "chokidar": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", - "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", - "dev": true, - "requires": { - "anymatch": "^2.0.0", - "async-each": "^1.0.0", - "braces": "^2.3.0", - "fsevents": "^1.2.2", - "glob-parent": "^3.1.0", - "inherits": "^2.0.1", - "is-binary-path": "^1.0.0", - "is-glob": "^4.0.0", - "lodash.debounce": "^4.0.8", - "normalize-path": "^2.1.1", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0", - "upath": "^1.0.5" - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - 
"resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": 
"sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } - } - }, - "glob2base": { - "version": "0.0.12", - "resolved": "https://registry.npmjs.org/glob2base/-/glob2base-0.0.12.tgz", - "integrity": "sha1-nUGbPijxLoOjYhZKJ3BVkiycDVY=", - "dev": true, - "requires": { - "find-index": "^0.1.1" } }, "global-modules": { @@ -6581,30 +5828,32 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } } }, "glogg": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/glogg/-/glogg-1.0.1.tgz", - "integrity": "sha512-ynYqXLoluBKf9XGR1gA59yEJisIL7YHEH4xr3ZziHB5/yl4qWfaK8Js9jGe6gBGCSCKVqiyO30WnRZADvemUNw==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/glogg/-/glogg-1.0.2.tgz", + "integrity": "sha512-5mwUoSuBk44Y4EshyiqcH95ZntbDdTQqA3QYSrxmzj28Ai0vXBGMH1ApSANH14j2sIRtqCEyg6PfsuP7ElOEDA==", "dev": true, "requires": { "sparkles": "^1.0.0" } }, "google-closure-compiler": { - "version": "20181008.0.0", - "resolved": "https://registry.npmjs.org/google-closure-compiler/-/google-closure-compiler-20181008.0.0.tgz", - "integrity": "sha512-XmJIasXHyy4kirthlsuDev2LZcXjYXWfOHwHdCLUQnfJH8T2sxWDNjFLQycaCIXwQLOyw2Kem38VgxrYfG0hzg==", + "version": "20190121.0.0", + "resolved": 
"https://registry.npmjs.org/google-closure-compiler/-/google-closure-compiler-20190121.0.0.tgz", + "integrity": "sha512-FIp3+KxjtDwykDTr1WsFo0QexEopAC4bDXXZfnEdgHECF7hCeFAAsLUPxMmj9Wx+O39eFCXGAzY7w0k5aU9qjg==", "dev": true, "requires": { "chalk": "^1.0.0", - "google-closure-compiler-linux": "^20181008.0.0", - "google-closure-compiler-osx": "^20181008.0.0", + "google-closure-compiler-java": "^20190121.0.0", + "google-closure-compiler-js": "^20190121.0.0", + "google-closure-compiler-linux": "^20190121.0.0", + "google-closure-compiler-osx": "^20190121.0.0", "minimist": "^1.2.0", "vinyl": "^2.0.1", "vinyl-sourcemaps-apply": "^0.2.0" @@ -6618,7 +5867,7 @@ }, "chalk": { "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", "dev": true, "requires": { @@ -6637,24 +5886,36 @@ } } }, + "google-closure-compiler-java": { + "version": "20190121.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-java/-/google-closure-compiler-java-20190121.0.0.tgz", + "integrity": "sha512-UCQ7ZXOlk/g101DS4TqyW+SaoR+4GVq7NKrwebH4gnESY76Xuz7FRrKWwfAXwltmiYAUVZCVI4qpoEz48V+VjA==", + "dev": true + }, + "google-closure-compiler-js": { + "version": "20190121.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-js/-/google-closure-compiler-js-20190121.0.0.tgz", + "integrity": "sha512-PgY0Fy+fXZnjir6aPz/FVJPXuwZf5pKJ9n7Hf1HL4x1lhqVIf3i+u3Ed6ZWCXa+YiEhvwH5RTQr/iPP/D3gDRg==", + "dev": true + }, "google-closure-compiler-linux": { - "version": "20181008.0.0", - "resolved": "https://registry.npmjs.org/google-closure-compiler-linux/-/google-closure-compiler-linux-20181008.0.0.tgz", - "integrity": "sha512-k8njGfH2uzWJiRPPvUxM7MJB28gPrf4kI2bbuiF0gJk/1arXcWCPGjLD6pzCU0UylMy52MUXLgsIpRorqf2brw==", + "version": "20190121.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-linux/-/google-closure-compiler-linux-20190121.0.0.tgz", + "integrity": "sha512-cw4qr9TuB2gB53l/oYadZLuw+zOi2yggYFtnNA5jvTLTqY8m2VZAL5DGL6gmCtZovbQ0bv9ANqjT8NxEtcSzfw==", "dev": true, "optional": true }, "google-closure-compiler-osx": { - "version": "20181008.0.0", - "resolved": "https://registry.npmjs.org/google-closure-compiler-osx/-/google-closure-compiler-osx-20181008.0.0.tgz", - "integrity": "sha512-xzf/yH/4MXdb6GbP84iHnpcVCOPBbH0gMVOs0JhR/KbrQh+DlJU+Y8Z/DQzTkw9HgD650R2/WZmBknURyg9OTw==", + "version": "20190121.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-osx/-/google-closure-compiler-osx-20190121.0.0.tgz", + "integrity": "sha512-6OqyUcgojPCqCuzdyKLwmIkBhfoWF3cVzaX8vaJvQ3SYwlITBT3aepMEZiWFRVvvml+ojs1AJcZvQIqFke8X1w==", "dev": true, "optional": true }, "graceful-fs": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz", - "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=", + "version": "4.1.15", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.15.tgz", + "integrity": "sha512-6uHUhOPEBgQ24HM+r6b/QwWfZq+yiFcipKFrOFiBEnWdy5sdzYoi+pJeQaPI5qOLRFqWmAXUPQNsielzdLoecA==", "dev": true }, "growl": { @@ -6681,15 +5942,6 @@ "vinyl-fs": "^3.0.0" }, "dependencies": { - "ansi-colors": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/ansi-colors/-/ansi-colors-1.1.0.tgz", - "integrity": "sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==", - "dev": true, - "requires": { - "ansi-wrap": "^0.1.0" - } - }, "gulp-cli": { 
"version": "2.0.1", "resolved": "https://registry.npmjs.org/gulp-cli/-/gulp-cli-2.0.1.tgz", @@ -6715,24 +5967,21 @@ "v8flags": "^3.0.1", "yargs": "^7.1.0" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true } } }, "gulp-json-transform": { - "version": "0.4.5", - "resolved": "https://registry.npmjs.org/gulp-json-transform/-/gulp-json-transform-0.4.5.tgz", - "integrity": "sha512-kaGUaAhgjxeLgIMNF3IPFFmYCF6AgvzBQwqmVowiIStNADZSoILtPNDisYA4mKfpwMTqSiWLogQt1q5U75+uwA==", + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/gulp-json-transform/-/gulp-json-transform-0.4.6.tgz", + "integrity": "sha512-laPoNiJP/+lAeiyb0lgY3cynOOi7R/QbPvKBEXJY6bm836nYg90pwY4mgwR7w8nFDlXiCToUeaoQCBIc2NudjA==", "dev": true, "requires": { - "gulp-util": "^3.0.8", + "ansi-colors": "^1.0.1", + "fancy-log": "^1.3.2", + "plugin-error": "^1.0.1", "promise": "^8.0.1", - "through2": "^2.0.3" + "through2": "^2.0.3", + "vinyl": "^2.1.0" } }, "gulp-rename": { @@ -6769,204 +6018,39 @@ } }, "gulp-typescript": { - "version": "5.0.0-alpha.3", - "resolved": "https://registry.npmjs.org/gulp-typescript/-/gulp-typescript-5.0.0-alpha.3.tgz", - "integrity": "sha512-6iSBjqBXAUqRsLUh/9XtlOnSzpPMbLrr5rqGj4UPLtGpDwFHW/fVTuRgv6LAWiKesLIUDDM0ourxvcpu2trecQ==", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/gulp-typescript/-/gulp-typescript-5.0.0.tgz", + "integrity": "sha512-lMj2U+Ni6HyFaY2nr1sSQ6D014eHil5L1i52XWBaAQUR9UAUUp9btnm4yRBT2Jb8xhrwqmhMssZf/g2B7cinCA==", "dev": true, "requires": { - "ansi-colors": "^2.0.2", + "ansi-colors": "^3.0.5", "plugin-error": "^1.0.1", "source-map": "^0.7.3", - "through2": "^2.0.3", + "through2": "^3.0.0", "vinyl": "^2.1.0", "vinyl-fs": "^3.0.3" }, "dependencies": { - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - } - }, - "glob-stream": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/glob-stream/-/glob-stream-6.1.0.tgz", - "integrity": "sha1-cEXJlBOz65SIjYOrRtC0BMx73eQ=", - "dev": true, - "requires": { - "extend": "^3.0.0", - "glob": "^7.1.1", - "glob-parent": "^3.1.0", - "is-negated-glob": "^1.0.0", - "ordered-read-streams": "^1.0.0", - "pumpify": "^1.3.5", - "readable-stream": "^2.1.5", - "remove-trailing-separator": "^1.0.1", - "to-absolute-glob": "^2.0.0", - "unique-stream": "^2.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - }, - "is-valid-glob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-valid-glob/-/is-valid-glob-1.0.0.tgz", - "integrity": "sha1-Kb8+/3Ab4tTTFdusw5vDn+j2Aao=", + "ansi-colors": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-3.2.3.tgz", + "integrity": "sha512-LEHHyuhlPY3TmuUYMh2oz89lTShfvgbmzaBcxve9t/9Wuy7Dwf4yoAKcND7KFT1HAQfqZ12qtc+DUrBMeKF9nw==", "dev": true }, - "ordered-read-streams": { - "version": "1.0.1", - "resolved": 
"https://registry.npmjs.org/ordered-read-streams/-/ordered-read-streams-1.0.1.tgz", - "integrity": "sha1-d8DLN8QVJdZBZtmQ/61+xqDhNj4=", - "dev": true, - "requires": { - "readable-stream": "^2.0.1" - } - }, "source-map": { "version": "0.7.3", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.3.tgz", "integrity": "sha512-CkCj6giN3S+n9qrYiBTX5gystlENnRW5jZeNLHpe6aue+SrHcG5VYwujhW9s4dY31mEGsxBDrHR6oI69fTXsaQ==", "dev": true }, - "to-absolute-glob": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/to-absolute-glob/-/to-absolute-glob-2.0.2.tgz", - "integrity": "sha1-GGX0PZ50sIItufFFt4z/fQ98hJs=", - "dev": true, - "requires": { - "is-absolute": "^1.0.0", - "is-negated-glob": "^1.0.0" - } - }, - "vinyl-fs": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/vinyl-fs/-/vinyl-fs-3.0.3.tgz", - "integrity": "sha512-vIu34EkyNyJxmP0jscNzWBSygh7VWhqun6RmqVfXePrOwi9lhvRs//dOaGOTRUQr4tx7/zd26Tk5WeSVZitgng==", - "dev": true, - "requires": { - "fs-mkdirp-stream": "^1.0.0", - "glob-stream": "^6.1.0", - "graceful-fs": "^4.0.0", - "is-valid-glob": "^1.0.0", - "lazystream": "^1.0.0", - "lead": "^1.0.0", - "object.assign": "^4.0.4", - "pumpify": "^1.3.5", - "readable-stream": "^2.3.3", - "remove-bom-buffer": "^3.0.0", - "remove-bom-stream": "^1.2.0", - "resolve-options": "^1.1.0", - "through2": "^2.0.0", - "to-through": "^2.0.0", - "value-or-function": "^3.0.0", - "vinyl": "^2.0.0", - "vinyl-sourcemap": "^1.1.0" - } - } - } - }, - "gulp-util": { - "version": "3.0.8", - "resolved": "https://registry.npmjs.org/gulp-util/-/gulp-util-3.0.8.tgz", - "integrity": "sha1-AFTh50RQLifATBh8PsxQXdVLu08=", - "dev": true, - "requires": { - "array-differ": "^1.0.0", - "array-uniq": "^1.0.2", - "beeper": "^1.0.0", - "chalk": "^1.0.0", - "dateformat": "^2.0.0", - "fancy-log": "^1.1.0", - "gulplog": "^1.0.0", - "has-gulplog": "^0.1.0", - "lodash._reescape": "^3.0.0", - "lodash._reevaluate": "^3.0.0", - "lodash._reinterpolate": "^3.0.0", - "lodash.template": "^3.0.0", - "minimist": "^1.1.0", - "multipipe": "^0.1.2", - "object-assign": "^3.0.0", - "replace-ext": "0.0.1", - "through2": "^2.0.0", - "vinyl": "^0.5.0" - }, - "dependencies": { - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", - "dev": true - }, - "chalk": { - "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "clone": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/clone/-/clone-1.0.4.tgz", - "integrity": "sha1-2jCcwmPfFZlMaIypAheco8fNfH4=", - "dev": true - }, - "clone-stats": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/clone-stats/-/clone-stats-0.0.1.tgz", - "integrity": "sha1-uI+UqCzzi4eR1YBG6kAprYjKmdE=", - "dev": true - }, - "object-assign": { + "through2": { "version": "3.0.0", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-3.0.0.tgz", - "integrity": "sha1-m+3VygiXlJvKR+f/QIBi1Un1h/I=", - "dev": true - }, - "replace-ext": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/replace-ext/-/replace-ext-0.0.1.tgz", - "integrity": "sha1-KbvZIHinOfC8zitO5B6DeVNSKSQ=", - "dev": true - }, - "supports-color": { - "version": "2.0.0", - "resolved": 
"https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true - }, - "vinyl": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/vinyl/-/vinyl-0.5.3.tgz", - "integrity": "sha1-sEVbOPxeDPMNQyUTLkYZcMIJHN4=", + "resolved": "https://registry.npmjs.org/through2/-/through2-3.0.0.tgz", + "integrity": "sha512-8B+sevlqP4OiCjonI1Zw03Sf8PuV1eRsYQgLad5eonILOdyeRsY27A/2Ze8IlvlMvq31OH+3fz/styI7Ya62yQ==", "dev": true, "requires": { - "clone": "^1.0.0", - "clone-stats": "^0.0.1", - "replace-ext": "0.0.1" + "readable-stream": "2 || 3", + "xtend": "~4.0.1" } } } @@ -7007,12 +6091,12 @@ "dev": true }, "har-validator": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.0.tgz", - "integrity": "sha512-+qnmNjI4OfH2ipQ9VQOw23bBd/ibtfbVdK2fYbY4acTDqKTW/YDp9McimZdDbG8iV9fZizUqQMD5xvriB146TA==", + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.3.tgz", + "integrity": "sha512-sNvOCzEQNr/qrvJgc3UG/kD4QtlHycrzwS+6mfTrrSq97BvaYcPZZI1ZSqGSPR73Cxn4LKTD4PttRwfU7jWq5g==", "dev": true, "requires": { - "ajv": "^5.3.0", + "ajv": "^6.5.5", "har-schema": "^2.0.0" } }, @@ -7039,15 +6123,6 @@ "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=" }, - "has-gulplog": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/has-gulplog/-/has-gulplog-0.1.0.tgz", - "integrity": "sha1-ZBTIKRNpfaUVkDl9r7EvIpZ4Ec4=", - "dev": true, - "requires": { - "sparkles": "^1.0.0" - } - }, "has-symbols": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.0.tgz", @@ -7069,14 +6144,6 @@ "get-value": "^2.0.6", "has-values": "^1.0.0", "isobject": "^3.0.0" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "has-values": { @@ -7089,26 +6156,6 @@ "kind-of": "^4.0.0" }, "dependencies": { - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, "kind-of": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz", @@ -7307,6 +6354,16 @@ "minimatch": "^3.0.4" } }, + "import-fresh": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-2.0.0.tgz", + "integrity": "sha1-2BNVwVYS04bGH53dOSLUMEgipUY=", + "dev": true, + "requires": { + "caller-path": "^2.0.0", + "resolve-from": "^3.0.0" + } + }, "import-local": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/import-local/-/import-local-1.0.0.tgz", @@ -7456,9 +6513,9 @@ } }, "interpret": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.1.0.tgz", - "integrity": "sha1-ftGxQQxqDg94z5XTuEQMY/eLhhQ=", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.2.0.tgz", + "integrity": "sha512-mT34yGKMNceBQUoVn7iCDKDntA7SC6gycMAWzGx1z/CMCTV7b2AAtXlo3nRyHZ1FelRkQbQjprHSYGwzLtkVbw==", "dev": true }, 
"invariant": { @@ -7499,6 +6556,17 @@ "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "is-arrayish": { @@ -7524,7 +6592,7 @@ }, "is-builtin-module": { "version": "1.0.0", - "resolved": "http://registry.npmjs.org/is-builtin-module/-/is-builtin-module-1.0.0.tgz", + "resolved": "https://registry.npmjs.org/is-builtin-module/-/is-builtin-module-1.0.0.tgz", "integrity": "sha1-VAVy0096wxGfj3bDDLwbHgN6/74=", "dev": true, "requires": { @@ -7553,6 +6621,17 @@ "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "is-date-object": { @@ -7608,9 +6687,9 @@ "dev": true }, "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", "dev": true }, "is-finite": { @@ -7638,12 +6717,12 @@ "dev": true }, "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", + "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", "dev": true, "requires": { - "is-extglob": "^1.0.0" + "is-extglob": "^2.1.1" } }, "is-negated-glob": { @@ -7653,29 +6732,31 @@ "dev": true }, "is-number": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", - "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", + "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "is-obj": { "version": "1.0.1", - "resolved": "http://registry.npmjs.org/is-obj/-/is-obj-1.0.1.tgz", + "resolved": "https://registry.npmjs.org/is-obj/-/is-obj-1.0.1.tgz", "integrity": "sha1-PkcprB9f3gJc19g6iW2rn09n2w8=", "dev": true }, - "is-observable": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/is-observable/-/is-observable-1.1.0.tgz", - "integrity": "sha512-NqCa4Sa2d+u7BWc6CukaObG3Fh+CU9bvixbpcXYhy2VvYS7vVGIdAgnIS5Ks3A/cqk4rebLJ9s8zBstT2aKnIA==", - "dev": true, - "requires": { - "symbol-observable": "^1.1.0" - } - }, "is-path-cwd": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/is-path-cwd/-/is-path-cwd-1.0.0.tgz", @@ -7713,14 +6794,6 @@ "dev": true, "requires": { "isobject": "^3.0.1" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "is-posix-bracket": { @@ -7750,12 +6823,6 @@ "has": "^1.0.1" } }, - "is-regexp": { - 
"version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-regexp/-/is-regexp-1.0.0.tgz", - "integrity": "sha1-/S2INUXEa6xaYz57mgnof6LLUGk=", - "dev": true - }, "is-relative": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/is-relative/-/is-relative-1.0.0.tgz", @@ -7841,13 +6908,10 @@ "dev": true }, "isobject": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", - "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", - "dev": true, - "requires": { - "isarray": "1.0.0" - } + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", + "dev": true }, "isstream": { "version": "0.1.2", @@ -7973,12 +7037,14 @@ } }, "ix": { - "version": "2.3.5", - "resolved": "https://registry.npmjs.org/ix/-/ix-2.3.5.tgz", - "integrity": "sha512-mdW2LtQiy+gPtggKa393EdSaI46RARsAa5zjlLgNKMlE57vC6dc6g6nehROI1Gj/HhsTvpb3WALSwg0EWhhz0Q==", + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/ix/-/ix-2.5.1.tgz", + "integrity": "sha512-YPX759NbhmIynoCYsxcpKBCQDFkeVup4xGaAylnIRaM+md7qrLyoW7kow0iqx4cJr8PUG85/cfwfjylqehg8bQ==", "dev": true, "requires": { - "tslib": "^1.8.0" + "@types/node": "^10.12.18", + "is-stream": "1.1.0", + "tslib": "^1.9.3" } }, "jest": { @@ -7997,6 +7063,38 @@ "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", "dev": true }, + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "dev": true + }, "cliui": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", @@ -8008,12 +7106,54 @@ "wrap-ansi": "^2.0.0" } }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, "is-fullwidth-code-point": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "dev": true }, + 
"is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, "jest-cli": { "version": "23.6.0", "resolved": "https://registry.npmjs.org/jest-cli/-/jest-cli-23.6.0.tgz", @@ -8058,6 +7198,36 @@ "yargs": "^11.0.0" } }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + }, "os-locale": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", @@ -8088,9 +7258,15 @@ "ansi-regex": "^3.0.0" } }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, "yargs": { "version": "11.1.0", - "resolved": "http://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", "integrity": "sha512-NwW69J42EsCSanF8kyn5upxvjp5ds+t3+udGBeTbFnERA+lF541DDpMawzo4z6W/QrzNM18D+BPMiOBibnFV5A==", "dev": true, "requires": { @@ -8107,6 +7283,15 @@ "y18n": "^3.2.1", "yargs-parser": "^9.0.2" } + }, + "yargs-parser": { + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-9.0.2.tgz", + "integrity": "sha1-nM9qQ0YP5O1Aqbto9I1DuKaMwHc=", + "dev": true, + "requires": { + "camelcase": "^4.1.0" + } } } }, @@ -8139,11 +7324,102 @@ "jest-validate": "^23.6.0", "micromatch": "^2.3.11", "pretty-format": "^23.6.0" - } - }, - "jest-diff": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-23.6.0.tgz", + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + 
"is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } + } + }, + "jest-diff": { + "version": "23.6.0", + "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-23.6.0.tgz", "integrity": "sha512-Gz9l5Ov+X3aL5L37IT+8hoCUsof1CVYBb2QEkOupK64XyRR3h+uRpYIm97K7sY8diFxowR8pIGEdyfMKTixo3g==", "dev": true, "requires": { @@ -8201,7 +7477,7 @@ }, "jest-get-type": { "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-get-type/-/jest-get-type-22.4.3.tgz", + "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-22.4.3.tgz", "integrity": "sha512-/jsz0Y+V29w1chdXVygEKSz2nBoHoYqNShPe+QgxSNjAuP1i8+k4LbQNrfoliKej0P45sivkSCh7yiD6ubHS3w==", "dev": true }, @@ -8219,6 +7495,97 @@ "jest-worker": "^23.2.0", "micromatch": "^2.3.11", "sane": "^2.0.0" + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": 
"^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } } }, "jest-jasmine2": { @@ -8272,6 +7639,97 @@ "micromatch": "^2.3.11", "slash": "^1.0.0", "stack-utils": "^1.0.1" + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } } }, "jest-mock": { @@ -8335,9 +7793,9 @@ "dev": true }, "source-map-support": { - "version": "0.5.9", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.9.tgz", - "integrity": 
"sha512-gR6Rw4MvUlYy83vP0vxoVNzM6t8MUXqNuRsuBmBHQDu1Fh6X015FrLdgoDKcNdkwGubozq0P4N0Q37UyFVr1EA==", + "version": "0.5.10", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.10.tgz", + "integrity": "sha512-YfQ3tQFTK/yzlGJuX8pTwa4tifQj4QS2Mj7UegOu8jAz59MqIiMGPXxQhVQiIMNzayuUSF/jEuVnfFF5JqybmQ==", "dev": true, "requires": { "buffer-from": "^1.0.0", @@ -8381,6 +7839,38 @@ "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", "dev": true }, + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "dev": true + }, "cliui": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", @@ -8392,12 +7882,84 @@ "wrap-ansi": "^2.0.0" } }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, "is-fullwidth-code-point": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "dev": true }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": 
"^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + }, "os-locale": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", @@ -8434,9 +7996,15 @@ "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", "dev": true }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, "yargs": { "version": "11.1.0", - "resolved": "http://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", "integrity": "sha512-NwW69J42EsCSanF8kyn5upxvjp5ds+t3+udGBeTbFnERA+lF541DDpMawzo4z6W/QrzNM18D+BPMiOBibnFV5A==", "dev": true, "requires": { @@ -8453,6 +8021,15 @@ "y18n": "^3.2.1", "yargs-parser": "^9.0.2" } + }, + "yargs-parser": { + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-9.0.2.tgz", + "integrity": "sha1-nM9qQ0YP5O1Aqbto9I1DuKaMwHc=", + "dev": true, + "requires": { + "camelcase": "^4.1.0" + } } } }, @@ -8462,6 +8039,16 @@ "integrity": "sha1-o3dq6zEekP6D+rnlM+hRAr0WQWU=", "dev": true }, + "jest-silent-reporter": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/jest-silent-reporter/-/jest-silent-reporter-0.1.1.tgz", + "integrity": "sha512-nrRzOV4151hG354tnVWfyZbFGJdylpadRWYWWPSD+WeOz2hQOjUGxvIFODnaY9cKQ7JWCtG+5LgSss22ccRhBg==", + "dev": true, + "requires": { + "chalk": "^2.3.1", + "jest-util": "^23.0.0" + } + }, "jest-snapshot": { "version": "23.6.0", "resolved": "https://registry.npmjs.org/jest-snapshot/-/jest-snapshot-23.6.0.tgz", @@ -8543,9 +8130,9 @@ "dev": true }, "js-yaml": { - "version": "3.12.0", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.0.tgz", - "integrity": "sha512-PIt2cnwmPfL4hKNwqeiuz4bKfnzHTBv6HyVgjahA6mPLwPDzjDWrplJBMjHUFxku/N3FlmrbyPclad+I+4mJ3A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.1.tgz", + "integrity": "sha512-um46hB9wNOKlwkHgiuyEVAybXBjwFUV0Z/RaHJblRd9DXltue9FTYvzCr9ErQrK9Adz5MU4gHWVaNUfdmrC8qA==", "dev": true, "requires": { "argparse": "^1.0.7", @@ -8622,19 +8209,16 @@ "dev": true }, "json-schema-traverse": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.3.1.tgz", - "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A=", + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", "dev": true }, - "json-stable-stringify": { + "json-stable-stringify-without-jsonify": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz", - "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=", - "dev": true, - "requires": { - "jsonify": "~0.0.0" - } + "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "integrity": "sha1-nbe1lJatPzz+8wp1FC0tkwrXJlE=", + "dev": true }, "json-stringify-safe": { "version": "5.0.1", @@ -8644,7 +8228,7 @@ }, "json5": { "version": "0.5.1", - "resolved": "http://registry.npmjs.org/json5/-/json5-0.5.1.tgz", + "resolved": "https://registry.npmjs.org/json5/-/json5-0.5.1.tgz", "integrity": "sha1-Hq3nrMASA0rYTiOWdn6tn6VJWCE=", "dev": true }, @@ -8688,13 +8272,10 @@ "dev": true }, "kind-of": { - "version": "3.2.2", - "resolved": 
"https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true }, "klaw": { "version": "1.3.1", @@ -8761,28 +8342,28 @@ "dev": true }, "lerna": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/lerna/-/lerna-3.4.3.tgz", - "integrity": "sha512-tWq1LvpHqkyB+FaJCmkEweivr88yShDMmauofPVdh0M5gU1cVucszYnIgWafulKYu2LMQ3IfUMUU5Pp3+MvADQ==", - "dev": true, - "requires": { - "@lerna/add": "^3.4.1", - "@lerna/bootstrap": "^3.4.1", - "@lerna/changed": "^3.4.1", - "@lerna/clean": "^3.3.2", - "@lerna/cli": "^3.2.0", - "@lerna/create": "^3.4.1", - "@lerna/diff": "^3.3.0", - "@lerna/exec": "^3.3.2", - "@lerna/import": "^3.3.1", - "@lerna/init": "^3.3.0", - "@lerna/link": "^3.3.0", - "@lerna/list": "^3.3.2", - "@lerna/publish": "^3.4.3", - "@lerna/run": "^3.3.2", - "@lerna/version": "^3.4.1", + "version": "3.10.7", + "resolved": "https://registry.npmjs.org/lerna/-/lerna-3.10.7.tgz", + "integrity": "sha512-ha/dehl/L3Nw0pbdir5z6Hrv2oYBg5ym2fTcuk8HCLe7Zdb/ylIHdrgW8CU9eTVZkwr4et8RdVtxFA/+xa65/Q==", + "dev": true, + "requires": { + "@lerna/add": "3.10.6", + "@lerna/bootstrap": "3.10.6", + "@lerna/changed": "3.10.6", + "@lerna/clean": "3.10.6", + "@lerna/cli": "3.10.7", + "@lerna/create": "3.10.6", + "@lerna/diff": "3.10.6", + "@lerna/exec": "3.10.6", + "@lerna/import": "3.10.6", + "@lerna/init": "3.10.6", + "@lerna/link": "3.10.6", + "@lerna/list": "3.10.6", + "@lerna/publish": "3.10.7", + "@lerna/run": "3.10.6", + "@lerna/version": "3.10.6", "import-local": "^1.0.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "leven": { @@ -8801,6 +8382,34 @@ "type-check": "~0.3.2" } }, + "libnpm": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/libnpm/-/libnpm-2.0.1.tgz", + "integrity": "sha512-qTKoxyJvpBxHZQB6k0AhSLajyXq9ZE/lUsZzuHAplr2Bpv9G+k4YuYlExYdUCeVRRGqcJt8hvkPh4tBwKoV98w==", + "dev": true, + "requires": { + "bin-links": "^1.1.2", + "bluebird": "^3.5.3", + "find-npm-prefix": "^1.0.2", + "libnpmaccess": "^3.0.1", + "libnpmconfig": "^1.2.1", + "libnpmhook": "^5.0.2", + "libnpmorg": "^1.0.0", + "libnpmpublish": "^1.1.0", + "libnpmsearch": "^2.0.0", + "libnpmteam": "^1.0.1", + "lock-verify": "^2.0.2", + "npm-lifecycle": "^2.1.0", + "npm-logical-tree": "^1.2.1", + "npm-package-arg": "^6.1.0", + "npm-profile": "^4.0.1", + "npm-registry-fetch": "^3.8.0", + "npmlog": "^4.1.2", + "pacote": "^9.2.3", + "read-package-json": "^2.0.13", + "stringify-package": "^1.0.0" + } + }, "libnpmaccess": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/libnpmaccess/-/libnpmaccess-3.0.1.tgz", @@ -8827,564 +8436,293 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, - "liftoff": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/liftoff/-/liftoff-2.5.0.tgz", - "integrity": "sha1-IAkpG7Mc6oYbvxCnwVooyvdcMew=", + "libnpmconfig": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/libnpmconfig/-/libnpmconfig-1.2.1.tgz", + "integrity": 
"sha512-9esX8rTQAHqarx6qeZqmGQKBNZR5OIbl/Ayr0qQDy3oXja2iFVQQI81R6GZ2a02bSNZ9p3YOGX1O6HHCb1X7kA==", "dev": true, "requires": { - "extend": "^3.0.0", - "findup-sync": "^2.0.0", - "fined": "^1.0.1", - "flagged-respawn": "^1.0.0", - "is-plain-object": "^2.0.4", - "object.map": "^1.0.0", - "rechoir": "^0.6.2", - "resolve": "^1.1.7" - } - }, - "lint-staged": { - "version": "7.3.0", - "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-7.3.0.tgz", - "integrity": "sha512-AXk40M9DAiPi7f4tdJggwuKIViUplYtVj1os1MVEteW7qOkU50EOehayCfO9TsoGK24o/EsWb41yrEgfJDDjCw==", - "dev": true, - "requires": { - "chalk": "^2.3.1", - "commander": "^2.14.1", - "cosmiconfig": "^5.0.2", - "debug": "^3.1.0", - "dedent": "^0.7.0", - "execa": "^0.9.0", - "find-parent-dir": "^0.3.0", - "is-glob": "^4.0.0", - "is-windows": "^1.0.2", - "jest-validate": "^23.5.0", - "listr": "^0.14.1", - "lodash": "^4.17.5", - "log-symbols": "^2.2.0", - "micromatch": "^3.1.8", - "npm-which": "^3.0.1", - "p-map": "^1.1.1", - "path-is-inside": "^1.0.2", - "pify": "^3.0.0", - "please-upgrade-node": "^3.0.2", - "staged-git-files": "1.1.1", - "string-argv": "^0.0.2", - "stringify-object": "^3.2.2" + "figgy-pudding": "^3.5.1", + "find-up": "^3.0.0", + "ini": "^1.3.5" }, "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", + "find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", "dev": true, "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } + "locate-path": "^3.0.0" } }, - "debug": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", - "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", "dev": true, "requires": { - "ms": "^2.1.1" + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" } }, - "execa": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/execa/-/execa-0.9.0.tgz", - "integrity": "sha512-BbUMBiX4hqiHZUA5+JujIjNb6TyAlp2D5KLheMjMluwOuzcnylDL4AxZYLLn1n2AGB49eSWwyKvvEQoRpnAtmA==", + "p-limit": { + "version": "2.1.0", + "resolved": 
"https://registry.npmjs.org/p-limit/-/p-limit-2.1.0.tgz", + "integrity": "sha512-NhURkNcrVB+8hNfLuysU8enY5xn2KXphsHBaC2YmRNTZRc7RWusw6apSpdEj3jo4CMb6W9nrF6tTnsJsJeyu6g==", "dev": true, "requires": { - "cross-spawn": "^5.0.1", - "get-stream": "^3.0.0", - "is-stream": "^1.1.0", - "npm-run-path": "^2.0.0", - "p-finally": "^1.0.0", - "signal-exit": "^3.0.0", - "strip-eof": "^1.0.0" + "p-try": "^2.0.0" } }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", + "p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", "dev": true, "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - }, - "ms": { - "version": "2.0.0", - "resolved": 
"https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", - "dev": true - } + "p-limit": "^2.0.0" } }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } + "p-try": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.0.0.tgz", + "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", + "dev": true }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } + } + }, + "libnpmhook": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/libnpmhook/-/libnpmhook-5.0.2.tgz", + "integrity": "sha512-vLenmdFWhRfnnZiNFPNMog6CK7Ujofy2TWiM2CrpZUjBRIhHkJeDaAbJdYCT6W4lcHtyrJR8yXW8KFyq6UAp1g==", + "dev": true, + "requires": { + "aproba": "^2.0.0", + "figgy-pudding": "^3.4.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" + }, + "dependencies": { + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", + "dev": true + }, + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } + "pump": "^3.0.0" } }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": 
true, "requires": { - "kind-of": "^6.0.0" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } + } + } + }, + "libnpmorg": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/libnpmorg/-/libnpmorg-1.0.0.tgz", + "integrity": "sha512-o+4eVJBoDGMgRwh2lJY0a8pRV2c/tQM/SxlqXezjcAg26Qe9jigYVs+Xk0vvlYDWCDhP0g74J8UwWeAgsB7gGw==", + "dev": true, + "requires": { + "aproba": "^2.0.0", + "figgy-pudding": "^3.4.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" + }, + "dependencies": { + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", + "dev": true }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "kind-of": "^6.0.0" + "pump": "^3.0.0" } }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", + } + } + }, + "libnpmpublish": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/libnpmpublish/-/libnpmpublish-1.1.1.tgz", + "integrity": "sha512-nefbvJd/wY38zdt+b9SHL6171vqBrMtZ56Gsgfd0duEKb/pB8rDT4/ObUQLrHz1tOfht1flt2zM+UGaemzAG5g==", + "dev": true, + "requires": { + "aproba": "^2.0.0", + "figgy-pudding": "^3.5.1", + "get-stream": "^4.0.0", + "lodash.clonedeep": "^4.5.0", + "normalize-package-data": "^2.4.0", + "npm-package-arg": "^6.1.0", + "npm-registry-fetch": "^3.8.0", + "semver": "^5.5.1", + "ssri": "^6.0.1" + }, + "dependencies": { + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", "dev": true }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "is-extglob": "^2.1.1" + "pump": "^3.0.0" } }, - "is-number": { + "pump": { "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", + 
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "ms": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", - "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==", - "dev": true } } }, - "listr": { - "version": "0.14.2", - "resolved": "https://registry.npmjs.org/listr/-/listr-0.14.2.tgz", - "integrity": "sha512-vmaNJ1KlGuGWShHI35X/F8r9xxS0VTHh9GejVXwSN20fG5xpq3Jh4bJbnumoT6q5EDM/8/YP1z3YMtQbFmhuXw==", + "libnpmsearch": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/libnpmsearch/-/libnpmsearch-2.0.0.tgz", + "integrity": "sha512-vd+JWbTGzOSfiOc+72MU6y7WqmBXn49egCCrIXp27iE/88bX8EpG64ST1blWQI1bSMUr9l1AKPMVsqa2tS5KWA==", "dev": true, "requires": { - "@samverschueren/stream-to-observable": "^0.3.0", - "is-observable": "^1.1.0", - "is-promise": "^2.1.0", - "is-stream": "^1.1.0", - "listr-silent-renderer": "^1.1.1", - "listr-update-renderer": "^0.4.0", - "listr-verbose-renderer": "^0.4.0", - "p-map": "^1.1.1", - "rxjs": "^6.1.0" - }, - "dependencies": { - "rxjs": { - "version": "6.3.3", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-6.3.3.tgz", - "integrity": "sha512-JTWmoY9tWCs7zvIk/CvRjhjGaOd+OVBM987mxFo+OW66cGpdKjZcpmc74ES1sB//7Kl/PAe8+wEakuhG4pcgOw==", - "dev": true, - "requires": { - "tslib": "^1.9.0" - } - } - } - }, - "listr-silent-renderer": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz", - "integrity": "sha1-kktaN1cVN3C/Go4/v3S4u/P5JC4=", - "dev": true - }, - "listr-update-renderer": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/listr-update-renderer/-/listr-update-renderer-0.4.0.tgz", - "integrity": "sha1-NE2YDaLKLosUW6MFkI8yrj9MyKc=", - "dev": true, - "requires": { - "chalk": "^1.1.3", - "cli-truncate": "^0.2.1", - "elegant-spinner": "^1.0.1", - "figures": "^1.7.0", - "indent-string": "^3.0.0", - "log-symbols": "^1.0.2", - "log-update": "^1.0.2", - "strip-ansi": "^3.0.1" + 
"figgy-pudding": "^3.5.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" }, "dependencies": { - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", - "dev": true - }, - "chalk": { - "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "figures": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/figures/-/figures-1.7.0.tgz", - "integrity": "sha1-y+Hjr/zxzUS4DK3+0o3Hk6lwHS4=", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "escape-string-regexp": "^1.0.5", - "object-assign": "^4.1.0" + "pump": "^3.0.0" } }, - "log-symbols": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-1.0.2.tgz", - "integrity": "sha1-N2/3tY6jCGoPCfrMdGF+ylAeGhg=", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "chalk": "^1.0.0" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "supports-color": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true } } }, - "listr-verbose-renderer": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/listr-verbose-renderer/-/listr-verbose-renderer-0.4.1.tgz", - "integrity": "sha1-ggb0z21S3cWCfl/RSYng6WWTOjU=", + "libnpmteam": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/libnpmteam/-/libnpmteam-1.0.1.tgz", + "integrity": "sha512-gDdrflKFCX7TNwOMX1snWojCoDE5LoRWcfOC0C/fqF7mBq8Uz9zWAX4B2RllYETNO7pBupBaSyBDkTAC15cAMg==", "dev": true, "requires": { - "chalk": "^1.1.3", - "cli-cursor": "^1.0.2", - "date-fns": "^1.27.2", - "figures": "^1.7.0" + "aproba": "^2.0.0", + "figgy-pudding": "^3.4.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" }, "dependencies": { - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", "dev": true }, - "chalk": { - "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "cli-cursor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-1.0.2.tgz", - "integrity": "sha1-ZNo/fValRBLll5S9Ytw1KV6PKYc=", - "dev": true, - "requires": { - "restore-cursor": "^1.0.1" - } - }, - "figures": { - "version": "1.7.0", - 
"resolved": "https://registry.npmjs.org/figures/-/figures-1.7.0.tgz", - "integrity": "sha1-y+Hjr/zxzUS4DK3+0o3Hk6lwHS4=", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "escape-string-regexp": "^1.0.5", - "object-assign": "^4.1.0" + "pump": "^3.0.0" } }, - "onetime": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/onetime/-/onetime-1.1.0.tgz", - "integrity": "sha1-ofeDj4MUxRbwXs78vEzP4EtO14k=", - "dev": true - }, - "restore-cursor": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-1.0.1.tgz", - "integrity": "sha1-NGYfRohjJ/7SmRR5FSJS35LapUE=", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "exit-hook": "^1.0.0", - "onetime": "^1.0.0" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "supports-color": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true } } }, + "liftoff": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/liftoff/-/liftoff-2.5.0.tgz", + "integrity": "sha1-IAkpG7Mc6oYbvxCnwVooyvdcMew=", + "dev": true, + "requires": { + "extend": "^3.0.0", + "findup-sync": "^2.0.0", + "fined": "^1.0.1", + "flagged-respawn": "^1.0.0", + "is-plain-object": "^2.0.4", + "object.map": "^1.0.0", + "rechoir": "^0.6.2", + "resolve": "^1.1.7" + } + }, "load-json-file": { "version": "1.1.0", - "resolved": "http://registry.npmjs.org/load-json-file/-/load-json-file-1.1.0.tgz", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-1.1.0.tgz", "integrity": "sha1-lWkFcI1YtLq0wiYbBPWfMcmTdMA=", "dev": true, "requires": { @@ -9397,27 +8735,38 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } } }, "loader-runner": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-2.3.1.tgz", - "integrity": "sha512-By6ZFY7ETWOc9RFaAIb23IjJVcM4dvJC/N57nmdz9RSkMXvAXGI7SyVlAw3v8vjtDRlqThgVDVmTnr9fqMlxkw==", + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-2.4.0.tgz", + "integrity": "sha512-Jsmr89RcXGIwivFY21FcRrisYZfvLMTWx5kOLc+JTxtpBOG6xML0vzbc6SEQG2FO9/4Fc3wW4LVcB5DmGflaRw==", "dev": true }, "loader-utils": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.1.0.tgz", - "integrity": "sha1-yYrvSIvM7aL/teLeZG1qdUQp9c0=", + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.2.3.tgz", + "integrity": "sha512-fkpz8ejdnEMG3s37wGL07iSBDg99O9D5yflE9RGNH3hRdx9SOwYfnGYdZOUIZitN8E+E2vkq3MUMYMvPYl5ZZA==", "dev": true, "requires": { - "big.js": "^3.1.3", + "big.js": "^5.2.2", "emojis-list": "^2.0.0", - "json5": "^0.5.0" + "json5": "^1.0.1" + }, + "dependencies": { + "json5": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.1.tgz", + "integrity": 
"sha512-aKS4WQjPenRxiQsC93MNfjx+nbF4PAdYzmd/1JIj8HYzqfbu86beTuNgXDzPknWk0n0uARlyewZo4s++ES36Ow==", + "dev": true, + "requires": { + "minimist": "^1.2.0" + } + } } }, "locate-path": { @@ -9428,6 +8777,24 @@ "requires": { "p-locate": "^2.0.0", "path-exists": "^3.0.0" + }, + "dependencies": { + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } + } + }, + "lock-verify": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/lock-verify/-/lock-verify-2.0.2.tgz", + "integrity": "sha512-QNVwK0EGZBS4R3YQ7F1Ox8p41Po9VGl2QG/2GsuvTbkJZYSsPeWHKMbbH6iZMCHWSMww5nrJroZYnGzI4cePuw==", + "dev": true, + "requires": { + "npm-package-arg": "^5.1.2 || 6", + "semver": "^5.4.1" } }, "lodash": { @@ -9436,114 +8803,34 @@ "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg==", "dev": true }, - "lodash._basecopy": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/lodash._basecopy/-/lodash._basecopy-3.0.1.tgz", - "integrity": "sha1-jaDmqHbPNEwK2KVIghEd08XHyjY=", - "dev": true - }, - "lodash._basetostring": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/lodash._basetostring/-/lodash._basetostring-3.0.1.tgz", - "integrity": "sha1-0YYdh3+CSlL2aYMtyvPuFVZqB9U=", - "dev": true - }, - "lodash._basevalues": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/lodash._basevalues/-/lodash._basevalues-3.0.0.tgz", - "integrity": "sha1-W3dXYoAr3j0yl1A+JjAIIP32Ybc=", - "dev": true - }, - "lodash._getnative": { - "version": "3.9.1", - "resolved": "https://registry.npmjs.org/lodash._getnative/-/lodash._getnative-3.9.1.tgz", - "integrity": "sha1-VwvH3t5G1hzc3mh9ZdPuy6o6r/U=", - "dev": true - }, - "lodash._isiterateecall": { - "version": "3.0.9", - "resolved": "https://registry.npmjs.org/lodash._isiterateecall/-/lodash._isiterateecall-3.0.9.tgz", - "integrity": "sha1-UgOte6Ql+uhCRg5pbbnPPmqsBXw=", - "dev": true - }, - "lodash._reescape": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/lodash._reescape/-/lodash._reescape-3.0.0.tgz", - "integrity": "sha1-Kx1vXf4HyKNVdT5fJ/rH8c3hYWo=", - "dev": true - }, - "lodash._reevaluate": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/lodash._reevaluate/-/lodash._reevaluate-3.0.0.tgz", - "integrity": "sha1-WLx0xAZklTrgsSTYBpltrKQx4u0=", - "dev": true - }, "lodash._reinterpolate": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/lodash._reinterpolate/-/lodash._reinterpolate-3.0.0.tgz", "integrity": "sha1-DM8tiRZq8Ds2Y8eWU4t1rG4RTZ0=", "dev": true }, - "lodash._root": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/lodash._root/-/lodash._root-3.0.1.tgz", - "integrity": "sha1-+6HEUkwZ7ppfgTa0YJ8BfPTe1pI=", - "dev": true - }, "lodash.camelcase": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", "integrity": "sha1-soqmKIorn8ZRA1x3EfZathkDMaY=" }, + "lodash.clonedeep": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz", + "integrity": "sha1-4j8/nE+Pvd6HJSnBBxhXoIblzO8=", + "dev": true + }, "lodash.debounce": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", "integrity": "sha1-gteb/zCmfEAF/9XiUVMArZyk168=", "dev": true }, - "lodash.escape": { - "version": "3.2.0", - "resolved": 
"https://registry.npmjs.org/lodash.escape/-/lodash.escape-3.2.0.tgz", - "integrity": "sha1-mV7g3BjBtIzJLv+ucaEKq1tIdpg=", - "dev": true, - "requires": { - "lodash._root": "^3.0.0" - } - }, - "lodash.isarguments": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/lodash.isarguments/-/lodash.isarguments-3.1.0.tgz", - "integrity": "sha1-L1c9hcaiQon/AGY7SRwdM4/zRYo=", - "dev": true - }, - "lodash.isarray": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/lodash.isarray/-/lodash.isarray-3.0.4.tgz", - "integrity": "sha1-eeTriMNqgSKvhvhEqpvNhRtfu1U=", - "dev": true - }, - "lodash.keys": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/lodash.keys/-/lodash.keys-3.1.2.tgz", - "integrity": "sha1-TbwEcrFWvlCgsoaFXRvQsMZWCYo=", - "dev": true, - "requires": { - "lodash._getnative": "^3.0.0", - "lodash.isarguments": "^3.0.0", - "lodash.isarray": "^3.0.0" - } - }, "lodash.padend": { "version": "4.6.1", "resolved": "https://registry.npmjs.org/lodash.padend/-/lodash.padend-4.6.1.tgz", "integrity": "sha1-U8y6BH0G4VjTEfRdpiX05J5vFm4=" }, - "lodash.restparam": { - "version": "3.6.1", - "resolved": "https://registry.npmjs.org/lodash.restparam/-/lodash.restparam-3.6.1.tgz", - "integrity": "sha1-k2pOMJ7zMKdkXtQUWYbIWuWyCAU=", - "dev": true - }, "lodash.sortby": { "version": "4.7.0", "resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz", @@ -9551,30 +8838,22 @@ "dev": true }, "lodash.template": { - "version": "3.6.2", - "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-3.6.2.tgz", - "integrity": "sha1-+M3sxhaaJVvpCYrosMU9N4kx0U8=", + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-4.4.0.tgz", + "integrity": "sha1-5zoDhcg1VZF0bgILmWecaQ5o+6A=", "dev": true, "requires": { - "lodash._basecopy": "^3.0.0", - "lodash._basetostring": "^3.0.0", - "lodash._basevalues": "^3.0.0", - "lodash._isiterateecall": "^3.0.0", - "lodash._reinterpolate": "^3.0.0", - "lodash.escape": "^3.0.0", - "lodash.keys": "^3.0.0", - "lodash.restparam": "^3.0.0", - "lodash.templatesettings": "^3.0.0" + "lodash._reinterpolate": "~3.0.0", + "lodash.templatesettings": "^4.0.0" } }, "lodash.templatesettings": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-3.1.1.tgz", - "integrity": "sha1-+zB4RHU7Zrnxr6VOJix0UwfbqOU=", + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-4.1.0.tgz", + "integrity": "sha1-K01OlbpEDZFf8IvImeRVNmZxMxY=", "dev": true, "requires": { - "lodash._reinterpolate": "^3.0.0", - "lodash.escape": "^3.0.0" + "lodash._reinterpolate": "~3.0.0" } }, "log-driver": { @@ -9583,58 +8862,6 @@ "integrity": "sha512-U7KCmLdqsGHBLeWqYlFA0V0Sl6P08EE1ZrmA9cxjUE0WVqT9qnyVDPz1kzpFEP0jdJuFnasWIfSd7fsaNXkpbg==", "dev": true }, - "log-symbols": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-2.2.0.tgz", - "integrity": "sha512-VeIAFslyIerEJLXHziedo2basKbMKtTw3vfn5IzG0XTjhAVEJyNHnL2p7vc+wBDSdQuUpNw3M2u6xb9QsAY5Eg==", - "dev": true, - "requires": { - "chalk": "^2.0.1" - } - }, - "log-update": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/log-update/-/log-update-1.0.2.tgz", - "integrity": "sha1-GZKfZMQJPS0ucHWh2tivWcKWuNE=", - "dev": true, - "requires": { - "ansi-escapes": "^1.0.0", - "cli-cursor": "^1.0.2" - }, - "dependencies": { - "ansi-escapes": { - "version": "1.4.0", - "resolved": 
"https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-1.4.0.tgz", - "integrity": "sha1-06ioOzGapneTZisT52HHkRQiMG4=", - "dev": true - }, - "cli-cursor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-1.0.2.tgz", - "integrity": "sha1-ZNo/fValRBLll5S9Ytw1KV6PKYc=", - "dev": true, - "requires": { - "restore-cursor": "^1.0.1" - } - }, - "onetime": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/onetime/-/onetime-1.1.0.tgz", - "integrity": "sha1-ofeDj4MUxRbwXs78vEzP4EtO14k=", - "dev": true - }, - "restore-cursor": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-1.0.1.tgz", - "integrity": "sha1-NGYfRohjJ/7SmRR5FSJS35LapUE=", - "dev": true, - "requires": { - "exit-hook": "^1.0.0", - "onetime": "^1.0.0" - } - } - } - }, "loose-envify": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", @@ -9655,9 +8882,9 @@ } }, "lru-cache": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.3.tgz", - "integrity": "sha512-fFEhvcgzuIoJVUF8fYr5KR0YqxD238zgObTps31YdADwPPAp82a4M8TrckkWyx7ekNlf9aBcVn81cFwwXngrJA==", + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.5.tgz", + "integrity": "sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g==", "dev": true, "requires": { "pseudomap": "^1.0.2", @@ -9670,397 +8897,113 @@ "integrity": "sha1-Jzi9nw089PhEkMVzbEhpmsYyzaM=", "dev": true, "requires": { - "es5-ext": "~0.10.2" - } - }, - "make-dir": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-1.3.0.tgz", - "integrity": "sha512-2w31R7SJtieJJnQtGc7RVL2StM2vGYVfqUOvUDxH6bC6aJTxPxTF0GnIgCyu7tjockiUWAYQRbxa7vKn34s5sQ==", - "dev": true, - "requires": { - "pify": "^3.0.0" - } - }, - "make-error": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.5.tgz", - "integrity": "sha512-c3sIjNUow0+8swNwVpqoH4YCShKNFkMaw6oH1mNS2haDZQqkeZFlHS3dhoeEbKKmJB4vXpJucU6oH75aDYeE9g==", - "dev": true - }, - "make-fetch-happen": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-4.0.1.tgz", - "integrity": "sha512-7R5ivfy9ilRJ1EMKIOziwrns9fGeAD4bAha8EB7BIiBBLHm2KeTUGCrICFt2rbHfzheTLynv50GnNTK1zDTrcQ==", - "dev": true, - "requires": { - "agentkeepalive": "^3.4.1", - "cacache": "^11.0.1", - "http-cache-semantics": "^3.8.1", - "http-proxy-agent": "^2.1.0", - "https-proxy-agent": "^2.2.1", - "lru-cache": "^4.1.2", - "mississippi": "^3.0.0", - "node-fetch-npm": "^2.0.2", - "promise-retry": "^1.1.1", - "socks-proxy-agent": "^4.0.0", - "ssri": "^6.0.0" - } - }, - "make-iterator": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/make-iterator/-/make-iterator-1.0.1.tgz", - "integrity": "sha512-pxiuXh0iVEq7VM7KMIhs5gxsfxCux2URptUQaXo4iZZJxBAzTPOLE2BumO5dbfVYq/hBJFBR/a1mFDmOx5AGmw==", - "dev": true, - "requires": { - "kind-of": "^6.0.2" - }, - "dependencies": { - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - } - } - }, - "makeerror": { - "version": "1.0.11", - "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.11.tgz", - "integrity": "sha1-4BpckQnyr3lmDk6LlYd5AYT1qWw=", - "dev": true, - "requires": { - "tmpl": "1.0.x" - } - }, - 
"map-age-cleaner": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz", - "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==", - "dev": true, - "requires": { - "p-defer": "^1.0.0" - } - }, - "map-cache": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/map-cache/-/map-cache-0.2.2.tgz", - "integrity": "sha1-wyq9C9ZSXZsFFkW7TyasXcmKDb8=", - "dev": true - }, - "map-obj": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/map-obj/-/map-obj-2.0.0.tgz", - "integrity": "sha1-plzSkIepJZi4eRJXpSPgISIqwfk=", - "dev": true - }, - "map-visit": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz", - "integrity": "sha1-7Nyo8TFE5mDxtb1B8S80edmN+48=", - "dev": true, - "requires": { - "object-visit": "^1.0.0" - } - }, - "marked": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/marked/-/marked-0.4.0.tgz", - "integrity": "sha512-tMsdNBgOsrUophCAFQl0XPe6Zqk/uy9gnue+jIIKhykO51hxyu6uNx7zBPy0+y/WKYVZZMspV9YeXLNdKk+iYw==", - "dev": true - }, - "matchdep": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/matchdep/-/matchdep-2.0.0.tgz", - "integrity": "sha1-xvNINKDY28OzfCfui7yyfHd1WC4=", - "dev": true, - "requires": { - "findup-sync": "^2.0.0", - "micromatch": "^3.0.4", - "resolve": "^1.4.0", - "stack-trace": "0.0.10" - }, - "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { 
- "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": 
"sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } + "es5-ext": "~0.10.2" } }, - "math-random": { + "make-dir": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-1.3.0.tgz", + "integrity": "sha512-2w31R7SJtieJJnQtGc7RVL2StM2vGYVfqUOvUDxH6bC6aJTxPxTF0GnIgCyu7tjockiUWAYQRbxa7vKn34s5sQ==", + "dev": true, + "requires": { + "pify": "^3.0.0" + } + }, + "make-error": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.5.tgz", + "integrity": "sha512-c3sIjNUow0+8swNwVpqoH4YCShKNFkMaw6oH1mNS2haDZQqkeZFlHS3dhoeEbKKmJB4vXpJucU6oH75aDYeE9g==", + "dev": true + }, + "make-fetch-happen": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-4.0.1.tgz", + "integrity": "sha512-7R5ivfy9ilRJ1EMKIOziwrns9fGeAD4bAha8EB7BIiBBLHm2KeTUGCrICFt2rbHfzheTLynv50GnNTK1zDTrcQ==", + "dev": true, + "requires": { + "agentkeepalive": "^3.4.1", + "cacache": "^11.0.1", + "http-cache-semantics": "^3.8.1", + "http-proxy-agent": "^2.1.0", + "https-proxy-agent": "^2.2.1", + "lru-cache": "^4.1.2", + "mississippi": "^3.0.0", + "node-fetch-npm": "^2.0.2", + "promise-retry": "^1.1.1", + "socks-proxy-agent": "^4.0.0", + "ssri": "^6.0.0" + 
} + }, + "make-iterator": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/math-random/-/math-random-1.0.1.tgz", - "integrity": "sha1-izqsWIuKZuSXXjzepn97sylgH6w=", + "resolved": "https://registry.npmjs.org/make-iterator/-/make-iterator-1.0.1.tgz", + "integrity": "sha512-pxiuXh0iVEq7VM7KMIhs5gxsfxCux2URptUQaXo4iZZJxBAzTPOLE2BumO5dbfVYq/hBJFBR/a1mFDmOx5AGmw==", + "dev": true, + "requires": { + "kind-of": "^6.0.2" + } + }, + "makeerror": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.11.tgz", + "integrity": "sha1-4BpckQnyr3lmDk6LlYd5AYT1qWw=", + "dev": true, + "requires": { + "tmpl": "1.0.x" + } + }, + "map-age-cleaner": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz", + "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==", + "dev": true, + "requires": { + "p-defer": "^1.0.0" + } + }, + "map-cache": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/map-cache/-/map-cache-0.2.2.tgz", + "integrity": "sha1-wyq9C9ZSXZsFFkW7TyasXcmKDb8=", + "dev": true + }, + "map-obj": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/map-obj/-/map-obj-2.0.0.tgz", + "integrity": "sha1-plzSkIepJZi4eRJXpSPgISIqwfk=", + "dev": true + }, + "map-visit": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz", + "integrity": "sha1-7Nyo8TFE5mDxtb1B8S80edmN+48=", + "dev": true, + "requires": { + "object-visit": "^1.0.0" + } + }, + "marked": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-0.4.0.tgz", + "integrity": "sha512-tMsdNBgOsrUophCAFQl0XPe6Zqk/uy9gnue+jIIKhykO51hxyu6uNx7zBPy0+y/WKYVZZMspV9YeXLNdKk+iYw==", + "dev": true + }, + "matchdep": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/matchdep/-/matchdep-2.0.0.tgz", + "integrity": "sha1-xvNINKDY28OzfCfui7yyfHd1WC4=", + "dev": true, + "requires": { + "findup-sync": "^2.0.0", + "micromatch": "^3.0.4", + "resolve": "^1.4.0", + "stack-trace": "0.0.10" + } + }, + "math-random": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/math-random/-/math-random-1.0.4.tgz", + "integrity": "sha512-rUxjysqif/BZQH2yhd5Aaq7vXMSx9NdEsQcyA07uEzIvxgI7zIr33gGsh+RU0/XjmQpCW7RsVof1vlkvQVCK5A==", "dev": true }, "md5.js": { @@ -10083,6 +9026,16 @@ "mimic-fn": "^1.0.0" } }, + "memfs": { + "version": "2.15.0", + "resolved": "https://registry.npmjs.org/memfs/-/memfs-2.15.0.tgz", + "integrity": "sha512-vktLqfHB1K4I9oiWlG4VjbztEreU5LqgnTnlVimr4bKNhJwjTmKg5+eYIimmNiKVUolTUrWSy2k/KEyqqLqZrQ==", + "dev": true, + "requires": { + "fast-extend": "0.0.2", + "fs-monkey": "^0.3.3" + } + }, "memoizee": { "version": "0.4.14", "resolved": "https://registry.npmjs.org/memoizee/-/memoizee-0.4.14.tgz", @@ -10132,6 +9085,15 @@ "trim-newlines": "^2.0.0" }, "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, "load-json-file": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", @@ -10214,24 +9176,24 @@ "dev": true }, "micromatch": { - "version": "2.3.11", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", - "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "version": "3.1.10", + "resolved": 
"https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", + "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", "dev": true, "requires": { - "arr-diff": "^2.0.0", - "array-unique": "^0.2.1", - "braces": "^1.8.2", - "expand-brackets": "^0.1.4", - "extglob": "^0.3.1", - "filename-regex": "^2.0.0", - "is-extglob": "^1.0.0", - "is-glob": "^2.0.1", - "kind-of": "^3.0.2", - "normalize-path": "^2.0.1", - "object.omit": "^2.0.0", - "parse-glob": "^3.0.4", - "regex-cache": "^0.4.2" + "arr-diff": "^4.0.0", + "array-unique": "^0.3.2", + "braces": "^2.3.1", + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "extglob": "^2.0.4", + "fragment-cache": "^0.2.1", + "kind-of": "^6.0.2", + "nanomatch": "^1.2.9", + "object.pick": "^1.3.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.2" } }, "miller-rabin": { @@ -10288,7 +9250,7 @@ }, "minimist": { "version": "1.2.0", - "resolved": "http://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", "integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=", "dev": true }, @@ -10321,9 +9283,9 @@ } }, "minizlib": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-1.1.1.tgz", - "integrity": "sha512-TrfjCjk4jLhcJyGMYymBH6oTXcWjYbUAXTHDbtnWHjZC25h0cdajHuPE1zxb4DVmu8crfh+HwH/WMuyLG0nHBg==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-1.2.1.tgz", + "integrity": "sha512-7+4oTUOWKg7AuL3vloEWekXY2/D20cevzsrNT2kGWm+39J9hGTCBv8VI5Pm5lXZ/o3/mdR4f8rflAPhnQb8mPA==", "dev": true, "requires": { "minipass": "^2.2.1" @@ -10345,6 +9307,18 @@ "pumpify": "^1.3.3", "stream-each": "^1.1.0", "through2": "^2.0.0" + }, + "dependencies": { + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + } } }, "mixin-deep": { @@ -10370,7 +9344,7 @@ }, "mkdirp": { "version": "0.5.1", - "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", "dev": true, "requires": { @@ -10379,7 +9353,7 @@ "dependencies": { "minimist": { "version": "0.0.8", - "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=", "dev": true } @@ -10410,7 +9384,7 @@ }, "pify": { "version": "2.3.0", - "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } @@ -10448,13 +9422,14 @@ "minimatch": "^3.0.0" } }, - "multipipe": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/multipipe/-/multipipe-0.1.2.tgz", - "integrity": "sha1-Ko8t33Du1WTf8tV/HhoTfZ8FB4s=", + "multistream": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/multistream/-/multistream-2.1.1.tgz", + "integrity": "sha512-xasv76hl6nr1dEy3lPvy7Ej7K/Lx3O/FCvwge8PeVJpciPPoNCbaANcNiBug3IpdvTveZUcAV0DJzdnUDMesNQ==", "dev": true, "requires": { - "duplexer2": "0.0.2" + "inherits": "^2.0.1", + "readable-stream": "^2.0.5" } }, "mute-stdout": { @@ -10470,9 +9445,9 @@ "dev": true }, 
"nan": { - "version": "2.11.1", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.11.1.tgz", - "integrity": "sha512-iji6k87OSXa0CcrLl9z+ZiYSuR2o+c0bGuNmXdrhTQTakxytAFsC56SArGYoiHlJlFoHSnvmhpceZJaXkVuOtA==", + "version": "2.12.1", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.12.1.tgz", + "integrity": "sha512-JY7V6lRkStKcKTvHO5NVSQRv+RV+FIL5pvDoLiAtSL9pKlC5x9PKQcZDsq7m4FO4d57mkhC6Z+QhAh3Jdk5JFw==", "dev": true, "optional": true }, @@ -10493,26 +9468,6 @@ "regex-not": "^1.0.0", "snapdragon": "^0.8.1", "to-regex": "^3.0.1" - }, - "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - } } }, "natural-compare": { @@ -10572,7 +9527,7 @@ "dependencies": { "semver": { "version": "5.3.0", - "resolved": "http://registry.npmjs.org/semver/-/semver-5.3.0.tgz", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.3.0.tgz", "integrity": "sha1-myzl094C0XxgEq0yaqa00M9U+U8=", "dev": true } @@ -10585,9 +9540,9 @@ "dev": true }, "node-libs-browser": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/node-libs-browser/-/node-libs-browser-2.1.0.tgz", - "integrity": "sha512-5AzFzdoIMb89hBGMZglEegffzgRg+ZFoUmisQ8HI4j1KDdpx13J0taNp2y9xPbur6W61gepGDDotGBVQ7mfUCg==", + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/node-libs-browser/-/node-libs-browser-2.2.0.tgz", + "integrity": "sha512-5MQunG/oyOaBdttrL40dA7bUfPORLRWMUJLQtMg7nluxUvk5XwnLdL9twQHFAjRx/y7mIMkLKT9++qPbbk6BZA==", "dev": true, "requires": { "assert": "^1.1.1", @@ -10597,7 +9552,7 @@ "constants-browserify": "^1.0.0", "crypto-browserify": "^3.11.0", "domain-browser": "^1.1.1", - "events": "^1.0.0", + "events": "^3.0.0", "https-browserify": "^1.0.0", "os-browserify": "^0.3.0", "path-browserify": "0.0.0", @@ -10611,8 +9566,16 @@ "timers-browserify": "^2.0.4", "tty-browserify": "0.0.0", "url": "^0.11.0", - "util": "^0.10.3", + "util": "^0.11.0", "vm-browserify": "0.0.4" + }, + "dependencies": { + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", + "dev": true + } } }, "node-notifier": { @@ -10696,6 +9659,12 @@ } } }, + "npm-logical-tree": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/npm-logical-tree/-/npm-logical-tree-1.2.1.tgz", + "integrity": "sha512-AJI/qxDB2PWI4LG1CYN579AY1vCiNyWfkiquCsJWqntRu/WwimVrC8yXeILBFHDwxfOejxewlmnvW9XXjMlYIg==", + "dev": true + }, "npm-package-arg": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/npm-package-arg/-/npm-package-arg-6.1.0.tgz", @@ -10709,24 +9678,15 @@ } }, "npm-packlist": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/npm-packlist/-/npm-packlist-1.1.12.tgz", - "integrity": "sha512-WJKFOVMeAlsU/pjXuqVdzU0WfgtIBCupkEVwn+1Y0ERAbUfWw8R4GjgVbaKnUjRoD2FoQbHOCbOyT5Mbs9Lw4g==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/npm-packlist/-/npm-packlist-1.2.0.tgz", + "integrity": 
"sha512-7Mni4Z8Xkx0/oegoqlcao/JpPCPEMtUvsmB0q7mgvlMinykJLSRTYuFqoQLYgGY8biuxIeiHO+QNJKbCfljewQ==", "dev": true, "requires": { "ignore-walk": "^3.0.1", "npm-bundled": "^1.0.1" } }, - "npm-path": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/npm-path/-/npm-path-2.0.4.tgz", - "integrity": "sha512-IFsj0R9C7ZdR5cP+ET342q77uSRdtWOlWpih5eC+lu29tIDbNEgDbzgVJ5UFvYHWhxDZ5TFkJafFioO0pPQjCw==", - "dev": true, - "requires": { - "which": "^1.2.10" - } - }, "npm-pick-manifest": { "version": "2.2.3", "resolved": "https://registry.npmjs.org/npm-pick-manifest/-/npm-pick-manifest-2.2.3.tgz", @@ -10738,10 +9698,21 @@ "semver": "^5.4.1" } }, + "npm-profile": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/npm-profile/-/npm-profile-4.0.1.tgz", + "integrity": "sha512-NQ1I/1Q7YRtHZXkcuU1/IyHeLy6pd+ScKg4+DQHdfsm769TGq6HPrkbuNJVJS4zwE+0mvvmeULzQdWn2L2EsVA==", + "dev": true, + "requires": { + "aproba": "^1.1.2 || 2", + "figgy-pudding": "^3.4.1", + "npm-registry-fetch": "^3.8.0" + } + }, "npm-registry-fetch": { - "version": "3.8.0", - "resolved": "https://registry.npmjs.org/npm-registry-fetch/-/npm-registry-fetch-3.8.0.tgz", - "integrity": "sha512-hrw8UMD+Nob3Kl3h8Z/YjmKamb1gf7D1ZZch2otrIXM3uFLB5vjEY6DhMlq80z/zZet6eETLbOXcuQudCB3Zpw==", + "version": "3.9.0", + "resolved": "https://registry.npmjs.org/npm-registry-fetch/-/npm-registry-fetch-3.9.0.tgz", + "integrity": "sha512-srwmt8YhNajAoSAaDWndmZgx89lJwIZ1GWxOuckH4Coek4uHv5S+o/l9FLQe/awA+JwTnj4FJHldxhlXdZEBmw==", "dev": true, "requires": { "JSONStream": "^1.3.4", @@ -10841,17 +9812,6 @@ "path-key": "^2.0.0" } }, - "npm-which": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/npm-which/-/npm-which-3.0.1.tgz", - "integrity": "sha1-kiXybsOihcIJyuZ8OxGmtKtxQKo=", - "dev": true, - "requires": { - "commander": "^2.9.0", - "npm-path": "^2.0.2", - "which": "^1.2.10" - } - }, "npmlog": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-4.1.2.tgz", @@ -10907,6 +9867,15 @@ "requires": { "is-descriptor": "^0.1.0" } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } } } }, @@ -10923,14 +9892,6 @@ "dev": true, "requires": { "isobject": "^3.0.0" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "object.assign": { @@ -10955,23 +9916,6 @@ "array-slice": "^1.0.0", "for-own": "^1.0.0", "isobject": "^3.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "object.getownpropertydescriptors": { @@ -10992,17 +9936,6 @@ "requires": { "for-own": "^1.0.0", "make-iterator": "^1.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - } } }, "object.omit": { @@ -11013,6 +9946,17 @@ "requires": { "for-own": "^0.1.4", 
"is-extendable": "^0.1.1" + }, + "dependencies": { + "for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", + "dev": true, + "requires": { + "for-in": "^1.0.1" + } + } } }, "object.pick": { @@ -11022,14 +9966,6 @@ "dev": true, "requires": { "isobject": "^3.0.1" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "object.reduce": { @@ -11040,17 +9976,6 @@ "requires": { "for-own": "^1.0.0", "make-iterator": "^1.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - } } }, "once": { @@ -11083,7 +10008,7 @@ "dependencies": { "minimist": { "version": "0.0.10", - "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz", "integrity": "sha1-3j+YVD2/lggr5IrRoMfNqDYwHc8=", "dev": true } @@ -11134,7 +10059,7 @@ }, "os-locale": { "version": "1.4.0", - "resolved": "http://registry.npmjs.org/os-locale/-/os-locale-1.4.0.tgz", + "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-1.4.0.tgz", "integrity": "sha1-IPnxeuKe00XoveWDsT0gCYA8FNk=", "dev": true, "requires": { @@ -11171,7 +10096,7 @@ }, "p-is-promise": { "version": "1.1.0", - "resolved": "http://registry.npmjs.org/p-is-promise/-/p-is-promise-1.1.0.tgz", + "resolved": "https://registry.npmjs.org/p-is-promise/-/p-is-promise-1.1.0.tgz", "integrity": "sha1-nJRWmJ6fZYgBewQ01WCXZ1w9oF4=", "dev": true }, @@ -11236,17 +10161,17 @@ } }, "pacote": { - "version": "9.2.3", - "resolved": "https://registry.npmjs.org/pacote/-/pacote-9.2.3.tgz", - "integrity": "sha512-Y3+yY3nBRAxMlZWvr62XLJxOwCmG9UmkGZkFurWHoCjqF0cZL72cTOCRJTvWw8T4OhJS2RTg13x4oYYriauvEw==", + "version": "9.4.1", + "resolved": "https://registry.npmjs.org/pacote/-/pacote-9.4.1.tgz", + "integrity": "sha512-YKSRsQqmeHxgra0KCdWA2FtVxDPUlBiCdmew+mSe44pzlx5t1ViRMWiQg18T+DREA+vSqYfKzynaToFR4hcKHw==", "dev": true, "requires": { - "bluebird": "^3.5.2", - "cacache": "^11.2.0", + "bluebird": "^3.5.3", + "cacache": "^11.3.2", "figgy-pudding": "^3.5.1", "get-stream": "^4.1.0", "glob": "^7.1.3", - "lru-cache": "^4.1.3", + "lru-cache": "^5.1.1", "make-fetch-happen": "^4.0.1", "minimatch": "^3.0.4", "minipass": "^2.3.5", @@ -11265,7 +10190,7 @@ "safe-buffer": "^5.1.2", "semver": "^5.6.0", "ssri": "^6.0.1", - "tar": "^4.4.6", + "tar": "^4.4.8", "unique-filename": "^1.1.1", "which": "^1.3.1" }, @@ -11279,6 +10204,25 @@ "pump": "^3.0.0" } }, + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "tar": { "version": "4.4.8", "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", @@ -11302,10 +10246,18 @@ } } }, + "pad-left": { + 
"version": "2.1.0", + "resolved": "https://registry.npmjs.org/pad-left/-/pad-left-2.1.0.tgz", + "integrity": "sha1-FuajstRKjhOMsIOMx8tAOk/J6ZQ=", + "requires": { + "repeat-string": "^1.5.4" + } + }, "pako": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.7.tgz", - "integrity": "sha512-3HNK5tW4x8o5mO8RuHZp3Ydw9icZXx0RANAOMzlMzx7LVXhMJ4mo3MOBpzyd7r/+RUu8BmndP47LXT+vzjtWcQ==", + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.8.tgz", + "integrity": "sha512-6i0HVbUfcKaTv+EG8ZTr75az7GFXcLYk9UyLEg7Notv/Ma+z/UG3TCoz6GiNeOrn1E/e63I0X/Hpw18jHOTUnA==", "dev": true }, "parallel-transform": { @@ -11320,16 +10272,17 @@ } }, "parse-asn1": { - "version": "5.1.1", - "resolved": "http://registry.npmjs.org/parse-asn1/-/parse-asn1-5.1.1.tgz", - "integrity": "sha512-KPx7flKXg775zZpnp9SxJlz00gTd4BmJ2yJufSc44gMCRrRQ7NSzAcSJQfifuOLgW6bEi+ftrALtsgALeB2Adw==", + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/parse-asn1/-/parse-asn1-5.1.3.tgz", + "integrity": "sha512-VrPoetlz7B/FqjBLD2f5wBVZvsZVLnRUrxVLfRYhGXCODa/NWE4p3Wp+6+aV3ZPL3KM7/OZmxDIwwijD7yuucg==", "dev": true, "requires": { "asn1.js": "^4.0.0", "browserify-aes": "^1.0.0", "create-hash": "^1.1.0", "evp_bytestokey": "^1.0.0", - "pbkdf2": "^3.0.3" + "pbkdf2": "^3.0.3", + "safe-buffer": "^5.1.1" } }, "parse-filepath": { @@ -11359,6 +10312,23 @@ "is-dotfile": "^1.0.0", "is-extglob": "^1.0.0", "is-glob": "^2.0.0" + }, + "dependencies": { + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + } } }, "parse-json": { @@ -11370,6 +10340,12 @@ "error-ex": "^1.2.0" } }, + "parse-node-version": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/parse-node-version/-/parse-node-version-1.0.0.tgz", + "integrity": "sha512-02GTVHD1u0nWc20n2G7WX/PgdhNFG04j5fi1OkaJzPWLTcf6vh6229Lta1wTmXG/7Dg42tCssgkccVt7qvd8Kg==", + "dev": true + }, "parse-passwd": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/parse-passwd/-/parse-passwd-1.0.0.tgz", @@ -11401,10 +10377,13 @@ "dev": true }, "path-exists": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", - "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", - "dev": true + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-2.1.0.tgz", + "integrity": "sha1-D+tsZPD8UY2adU3V77YscCJ2H0s=", + "dev": true, + "requires": { + "pinkie-promise": "^2.0.0" + } }, "path-is-absolute": { "version": "1.0.1", @@ -11458,7 +10437,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } @@ -11517,6 +10496,17 @@ "dev": true, "requires": { "find-up": "^2.1.0" + }, + "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + } } }, "platform": { @@ -11525,15 +10515,6 @@ "integrity": 
"sha512-TuvHS8AOIZNAlE77WUDiR4rySV/VMptyMfcfeoMgs4P8apaZM3JrnbzBiixKUv+XR6i+BXrQh8WAnjaSPFO65Q==", "dev": true }, - "please-upgrade-node": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/please-upgrade-node/-/please-upgrade-node-3.1.1.tgz", - "integrity": "sha512-KY1uHnQ2NlQHqIJQpnh/i54rKkuxCEBx+voJIS/Mvb+L2iYd2NMotwduhKTMjfC1uKoX3VXOxLjIYG66dfJTVQ==", - "dev": true, - "requires": { - "semver-compare": "^1.0.0" - } - }, "plugin-error": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/plugin-error/-/plugin-error-1.0.1.tgz", @@ -11544,23 +10525,6 @@ "arr-diff": "^4.0.0", "arr-union": "^3.1.0", "extend-shallow": "^3.0.2" - }, - "dependencies": { - "ansi-colors": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/ansi-colors/-/ansi-colors-1.1.0.tgz", - "integrity": "sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==", - "dev": true, - "requires": { - "ansi-wrap": "^0.1.0" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - } } }, "pn": { @@ -11607,7 +10571,7 @@ }, "pretty-hrtime": { "version": "1.0.3", - "resolved": "http://registry.npmjs.org/pretty-hrtime/-/pretty-hrtime-1.0.3.tgz", + "resolved": "https://registry.npmjs.org/pretty-hrtime/-/pretty-hrtime-1.0.3.tgz", "integrity": "sha1-t+PqQkNaTJsnWdmeDyAesZWALuE=", "dev": true }, @@ -11624,15 +10588,15 @@ "dev": true }, "process-nextick-args": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", - "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", + "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", "dev": true }, "progress": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.2.tgz", - "integrity": "sha512-/OLz5F9beZUWwSHZDreXgap1XShX6W+DCHQCqwCF7uZ88s6uTlD2cR3JBE77SegCmNtb1Idst+NfmwcdU6KVhw==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", + "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==", "dev": true }, "promise": { @@ -11707,9 +10671,9 @@ "dev": true }, "psl": { - "version": "1.1.29", - "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.29.tgz", - "integrity": "sha512-AeUmQ0oLN02flVHXWh9sSJF7mcdFq0ppid/JkErufc3hGIV/AMa8Fo9VgDo/cT2jFdOWoFvHp90qqBH54W+gjQ==", + "version": "1.1.31", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.31.tgz", + "integrity": "sha512-/6pt4+C+T+wZUieKR620OpzN/LlnNKuWjy1iFLQ/UG35JqHlR/89MP1d96dUfkf6Dne3TuLQzOYEYshJ+Hx8mw==", "dev": true }, "public-encrypt": { @@ -11727,9 +10691,9 @@ } }, "pump": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", + "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", "dev": true, "requires": { "end-of-stream": "^1.1.0", @@ -11745,24 +10709,12 @@ "duplexify": "^3.6.0", "inherits": "^2.0.3", "pump": "^2.0.0" - }, - "dependencies": { - "pump": { - "version": "2.0.1", 
- "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", - "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", - "dev": true, - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - } } }, "punycode": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", - "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", + "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", "dev": true }, "q": { @@ -11811,12 +10763,6 @@ "resolved": "https://registry.npmjs.org/is-number/-/is-number-4.0.0.tgz", "integrity": "sha512-rSklcAIlf1OmFdyAqbnWTLVelsQ58uvZ66S/ZyawjWqIviTWCjg2PzVGw8WUA+nNuPTqb4wgA+NszrJ+08LlgQ==", "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -11895,350 +10841,61 @@ } }, "read-pkg-up": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-1.0.1.tgz", - "integrity": "sha1-nWPBMnbAZZGNV/ACpX9AobZD+wI=", - "dev": true, - "requires": { - "find-up": "^1.0.0", - "read-pkg": "^1.0.0" - }, - "dependencies": { - "find-up": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-1.1.2.tgz", - "integrity": "sha1-ay6YIrGizgpgq2TWEOzK1TyyTQ8=", - "dev": true, - "requires": { - "path-exists": "^2.0.0", - "pinkie-promise": "^2.0.0" - } - }, - "path-exists": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-2.1.0.tgz", - "integrity": "sha1-D+tsZPD8UY2adU3V77YscCJ2H0s=", - "dev": true, - "requires": { - "pinkie-promise": "^2.0.0" - } - } - } - }, - "readable-stream": { - "version": "2.3.6", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", - "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", - "dev": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "readdir-scoped-modules": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/readdir-scoped-modules/-/readdir-scoped-modules-1.0.2.tgz", - "integrity": "sha1-n6+jfShr5dksuuve4DDcm19AZ0c=", - "dev": true, - "requires": { - "debuglog": "^1.0.1", - "dezalgo": "^1.0.0", - "graceful-fs": "^4.1.2", - "once": "^1.3.0" - } - }, - "readdirp": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-2.2.1.tgz", - "integrity": "sha512-1JU/8q+VgFZyxwrJ+SVIOsh+KywWGpds3NTqikiKpDMZWScmAYyKIgqkO+ARvNWJfXeXR1zxz7aHF4u4CyH6vQ==", - "dev": true, - "requires": { - "graceful-fs": "^4.1.11", - "micromatch": "^3.1.10", - "readable-stream": "^2.0.2" - }, - "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - 
"resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": 
"sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-1.0.1.tgz", + "integrity": "sha1-nWPBMnbAZZGNV/ACpX9AobZD+wI=", + 
"dev": true, + "requires": { + "find-up": "^1.0.0", + "read-pkg": "^1.0.0" + } + }, + "readable-stream": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", + "dev": true, + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + }, + "dependencies": { + "process-nextick-args": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", + "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } } } }, + "readdir-scoped-modules": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/readdir-scoped-modules/-/readdir-scoped-modules-1.0.2.tgz", + "integrity": "sha1-n6+jfShr5dksuuve4DDcm19AZ0c=", + "dev": true, + "requires": { + "debuglog": "^1.0.1", + "dezalgo": "^1.0.0", + "graceful-fs": "^4.1.2", + "once": "^1.3.0" + } + }, + "readdirp": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-2.2.1.tgz", + "integrity": "sha512-1JU/8q+VgFZyxwrJ+SVIOsh+KywWGpds3NTqikiKpDMZWScmAYyKIgqkO+ARvNWJfXeXR1zxz7aHF4u4CyH6vQ==", + "dev": true, + "requires": { + "graceful-fs": "^4.1.11", + "micromatch": "^3.1.10", + "readable-stream": "^2.0.2" + } + }, "realpath-native": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/realpath-native/-/realpath-native-1.0.2.tgz", @@ -12333,8 +10990,7 @@ "repeat-string": { "version": "1.6.1", "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", - "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=", - "dev": true + "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=" }, "repeating": { "version": "2.0.1", @@ -12423,12 +11079,12 @@ "dev": true }, "resolve": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.8.1.tgz", - "integrity": "sha512-AicPrAC7Qu1JxPCZ9ZgCZlY35QgFnNqc+0LtbRNxnVw4TXvjQ72wnuL9JQcEBgXkI9JM8MsT9kaQoHcpCRJOYA==", + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.10.0.tgz", + "integrity": "sha512-3sUr9aq5OfSg2S9pNtPA9hL1FVEAjvfOC4leW0SNf/mpnaakz2a9femSd6LqAww2RaFctwyf1lCqnTHuF1rxDg==", "dev": true, "requires": { - "path-parse": "^1.0.5" + "path-parse": "^1.0.6" } }, "resolve-cwd": { @@ -12494,12 +11150,12 @@ "dev": true }, "rimraf": { - "version": "2.6.2", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", - "integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", + "integrity": 
"sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==", "dev": true, "requires": { - "glob": "^7.0.5" + "glob": "^7.1.3" } }, "ripemd160": { @@ -12582,14 +11238,6 @@ "dev": true, "requires": { "symbol-observable": "1.0.1" - }, - "dependencies": { - "symbol-observable": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.0.1.tgz", - "integrity": "sha1-g0D8RwLDEi310iKI+IKD9RPT/dQ=", - "dev": true - } } }, "safe-buffer": { @@ -12600,7 +11248,7 @@ }, "safe-regex": { "version": "1.1.0", - "resolved": "http://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", + "resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", "integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=", "dev": true, "requires": { @@ -12628,292 +11276,6 @@ "minimist": "^1.1.1", "walker": "~1.0.5", "watch": "~0.18.0" - }, - "dependencies": { - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": 
"sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": 
"https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } } }, "sax": { @@ -12931,32 +11293,6 @@ "ajv": "^6.1.0", "ajv-errors": "^1.0.0", "ajv-keywords": "^3.1.0" - }, - "dependencies": { - "ajv": { - "version": "6.5.4", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.5.4.tgz", - "integrity": "sha512-4Wyjt8+t6YszqaXnLDfMmG/8AlO5Zbcsy3ATHncCzjW/NoPzAId8AK6749Ybjmdt+kUY1gP60fCu46oDxPv/mg==", - "dev": true, - "requires": { - "fast-deep-equal": "^2.0.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "fast-deep-equal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", - "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", - "dev": true - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true - } } }, "semver": { @@ -12965,12 +11301,6 @@ "integrity": "sha512-RS9R6R35NYgQn++fkDWaOmqGoj4Ek9gGs+DPxNUZKuwE183xjJroKvyo1IzVFeXvUrvmALy6FWD5xrdJT25gMg==", "dev": true }, - "semver-compare": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/semver-compare/-/semver-compare-1.0.0.tgz", - "integrity": "sha1-De4hahyUGrN+nvsXiPavxf9VN/w=", - "dev": true - }, "semver-greatest-satisfied-range": { 
"version": "1.1.0", "resolved": "https://registry.npmjs.org/semver-greatest-satisfied-range/-/semver-greatest-satisfied-range-1.1.0.tgz", @@ -12981,9 +11311,9 @@ } }, "serialize-javascript": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-1.5.0.tgz", - "integrity": "sha512-Ga8c8NjAAp46Br4+0oZ2WxJCwIzwP60Gq1YPgU+39PiTVxyed/iKE/zyZI6+UlVYH5Q4PaQdHhcegIFPZTUfoQ==", + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-1.6.1.tgz", + "integrity": "sha512-A5MOagrPFga4YaKQSWHryl7AXvbQkEqpw4NNYMTNYUNV51bA8ABHgYFpqKx+YFFrw59xMV1qGH1R4AgoNIVgCw==", "dev": true }, "set-blocking": { @@ -13023,7 +11353,7 @@ }, "sha.js": { "version": "2.4.11", - "resolved": "http://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", + "resolved": "https://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", "integrity": "sha512-QMEp5B7cftE7APOjk5Y6xgrbWu+WkLVQwk8JNjZ8nKRciZaByEW6MubieAiToS7+dwvrjGhH8jRXz3MVd0AYqQ==", "dev": true, "requires": { @@ -13059,9 +11389,9 @@ } }, "shelljs": { - "version": "0.8.2", - "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.2.tgz", - "integrity": "sha512-pRXeNrCA2Wd9itwhvLp5LZQvPJ0wU6bcjaTMywHHGX5XWhVN2nzSu7WV0q+oUY7mGK3mgSkDDzP3MgjqdyIgbQ==", + "version": "0.8.3", + "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.3.tgz", + "integrity": "sha512-fc0BKlAWiLpwZljmOvAOTE/gXawtCoNrP5oaY7KIaQbbyHeQVg01pSEuEGvGh3HEdBU4baCD7wQBwADmM/7f7A==", "dev": true, "requires": { "glob": "^7.0.0", @@ -13104,12 +11434,6 @@ "integrity": "sha1-xB8vbDn8FtHNF61LXYlhFK5HDVU=", "dev": true }, - "slice-ansi": { - "version": "0.0.4", - "resolved": "http://registry.npmjs.org/slice-ansi/-/slice-ansi-0.0.4.tgz", - "integrity": "sha1-7b+JA/ZvfOL46v1s7tZeJkyDGzU=", - "dev": true - }, "slide": { "version": "1.1.6", "resolved": "https://registry.npmjs.org/slide/-/slide-1.1.6.tgz", @@ -13117,9 +11441,9 @@ "dev": true }, "smart-buffer": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.0.1.tgz", - "integrity": "sha512-RFqinRVJVcCAL9Uh1oVqE6FZkqsyLiVOYEZ20TqIOjuX7iFVJ+zsbs4RIghnw/pTs7mZvt8ZHhvm1ZUrR4fykg==", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.0.2.tgz", + "integrity": "sha512-JDhEpTKzXusOqXZ0BUIdH+CjFdO/CR3tLlf5CN34IypI+xMmXW1uB16OOY8z3cICbJlDAVJzNbwBhNO0wt9OAw==", "dev": true }, "snapdragon": { @@ -13206,18 +11530,6 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -13228,16 +11540,27 @@ "dev": true, "requires": { "kind-of": "^3.2.0" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "socks": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/socks/-/socks-2.2.2.tgz", - "integrity": "sha512-g6wjBnnMOZpE0ym6e0uHSddz9p3a+WsBaaYQaBaSCJYvrC4IXykQR9MNGjLQf38e9iIIhp3b1/Zk8YZI3KGJ0Q==", + "version": "2.2.3", + "resolved": 
"https://registry.npmjs.org/socks/-/socks-2.2.3.tgz", + "integrity": "sha512-+2r83WaRT3PXYoO/1z+RDEBE7Z2f9YcdQnJ0K/ncXXbV5gJ6wYfNAebYFYiiUjM6E4JyXnPY8cimwyvFYHVUUA==", "dev": true, "requires": { "ip": "^1.1.5", - "smart-buffer": "^4.0.1" + "smart-buffer": "4.0.2" } }, "socks-proxy-agent": { @@ -13316,9 +11639,9 @@ "dev": true }, "spdx-correct": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.0.2.tgz", - "integrity": "sha512-q9hedtzyXHr5S0A1vEPoK/7l8NpfkFYTq6iCY+Pno2ZbdZR6WexZFtqeVGkGxW3TEJMN914Z55EnAGMmenlIQQ==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.0.tgz", + "integrity": "sha512-lr2EZCctC2BNR7j7WzJ2FpDznxky1sjfxvvYEyzxNyb6lZXHODmEoJeFu4JupYlkfha1KZpJyoqiJ7pgA1qq8Q==", "dev": true, "requires": { "spdx-expression-parse": "^3.0.0", @@ -13342,9 +11665,9 @@ } }, "spdx-license-ids": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.2.tgz", - "integrity": "sha512-qky9CVt0lVIECkEsYbNILVnPvycuEBkXoMFLRWsREkomQLevYhtRKC+R91a5TOAQ3bCMjikRwhyaRqj1VYatYg==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.3.tgz", + "integrity": "sha512-uBIcIl3Ih6Phe3XHK1NqboJLdGfwr1UN3k6wSD1dZpmPsIkb8AGNbZYJ1fOBk834+Gxy8rpfDxrS6XLEMZMY2g==", "dev": true }, "split": { @@ -13381,9 +11704,9 @@ "dev": true }, "sshpk": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz", - "integrity": "sha512-Ra/OXQtuh0/enyl4ETZAfTaeksa6BXks5ZcjpSUNrjBr0DvrJKX+1fsKDPpT9TBXgHAFsa4510aNVgI8g/+SzA==", + "version": "1.16.1", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz", + "integrity": "sha512-HXXqVUq7+pcKeLqqZj6mHFUMvXtOJt1uoUx09pFW6011inTMxqI8BA8PM95myrIyyKwdnzjdFjLiE6KBPVtJIg==", "dev": true, "requires": { "asn1": "~0.2.3", @@ -13413,15 +11736,9 @@ "dev": true }, "stack-utils": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-1.0.1.tgz", - "integrity": "sha1-1PM6tU6OOHeLDKXP07OvsS22hiA=", - "dev": true - }, - "staged-git-files": { - "version": "1.1.1", - "resolved": "http://registry.npmjs.org/staged-git-files/-/staged-git-files-1.1.1.tgz", - "integrity": "sha512-H89UNKr1rQJvI1c/PIR3kiAMBV23yvR7LItZiV74HWZwzt7f3YHuujJ9nJZlt58WlFox7XQsOahexwk7nTe69A==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-1.0.2.tgz", + "integrity": "sha512-MTX+MeG5U994cazkjd/9KNAapsHnibjMLnfXodlkXw76JEea0UiNzrqidzo1emMwk7w5Qhc9jd4Bn9TBb1MFwA==", "dev": true }, "static-extend": { @@ -13452,9 +11769,9 @@ "dev": true }, "stream-browserify": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", - "integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.2.tgz", + "integrity": "sha512-nX6hmklHs/gr2FuxYDltq8fJA1GDlxKQCz8O/IM4atRqBH8OORmBNgfvW5gG10GT/qQ9u0CzIvr2X5Pkt6ntqg==", "dev": true, "requires": { "inherits": "~2.0.1", @@ -13496,12 +11813,6 @@ "integrity": "sha1-1cdSgl5TZ+eG944Y5EXqIjoVWVI=", "dev": true }, - "string-argv": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/string-argv/-/string-argv-0.0.2.tgz", - "integrity": "sha1-2sMECGkMIfPDYwo/86BYd73L1zY=", - "dev": true - }, "string-length": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/string-length/-/string-length-2.0.0.tgz", @@ -13560,20 +11871,15 @@ 
"safe-buffer": "~5.1.0" } }, - "stringify-object": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/stringify-object/-/stringify-object-3.3.0.tgz", - "integrity": "sha512-rHqiFh1elqCQ9WPLIC8I0Q/g/wj5J1eMkyoiD6eoQApWHP0FtlK7rqnhmabL5VUY9JQCcqwwvlOaSuutekgyrw==", - "dev": true, - "requires": { - "get-own-enumerable-property-symbols": "^3.0.0", - "is-obj": "^1.0.1", - "is-regexp": "^1.0.0" - } + "stringify-package": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/stringify-package/-/stringify-package-1.0.0.tgz", + "integrity": "sha512-JIQqiWmLiEozOC0b0BtxZ/AOUtdUZHCBPgqIZ2kSJJqGwgb9neo44XdTHUC4HZSGqi03hOeB7W/E8rAlKnGe9g==", + "dev": true }, "strip-ansi": { "version": "3.0.1", - "resolved": "http://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=", "dev": true, "requires": { @@ -13597,7 +11903,7 @@ }, "strip-eof": { "version": "1.0.0", - "resolved": "http://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", + "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", "integrity": "sha1-u0P/VZim6wXYm1n80SnJgzE2Br8=", "dev": true }, @@ -13608,26 +11914,16 @@ "dev": true }, "strong-log-transformer": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/strong-log-transformer/-/strong-log-transformer-2.0.0.tgz", - "integrity": "sha512-FQmNqAXJgOX8ygOcvPLlGWBNT41mvNJ9ALoYf0GTwVt9t30mGTqpmp/oJx5gLcu52DXK10kS7dVWhx8aPXDTlg==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/strong-log-transformer/-/strong-log-transformer-2.1.0.tgz", + "integrity": "sha512-B3Hgul+z0L9a236FAUC9iZsL+nVHgoCJnqCbN588DjYxvGXaXaaFbfmQ/JhvKjZwsOukuR72XbHv71Qkug0HxA==", "dev": true, "requires": { - "byline": "^5.0.0", "duplexer": "^0.1.1", "minimist": "^1.2.0", "through": "^2.3.4" } }, - "subarg": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/subarg/-/subarg-1.0.0.tgz", - "integrity": "sha1-9izxdYHplrSPyWVpn1TAauJouNI=", - "dev": true, - "requires": { - "minimist": "^1.1.0" - } - }, "supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -13647,9 +11943,9 @@ } }, "symbol-observable": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.2.0.tgz", - "integrity": "sha512-e900nM8RRtGhlV36KGEU9k65K3mPb1WV70OdjfxlG2EAuM1noi/E/BaW/uMhL7bPEssK8QV57vN3esixjUvcXQ==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.0.1.tgz", + "integrity": "sha1-g0D8RwLDEi310iKI+IKD9RPT/dQ=", "dev": true }, "symbol-tree": { @@ -13708,9 +12004,9 @@ } }, "terser": { - "version": "3.10.8", - "resolved": "https://registry.npmjs.org/terser/-/terser-3.10.8.tgz", - "integrity": "sha512-GQJHWJ/vbx0EgRk+lBMONMmKaT+ifeo/XgT/hi3KpzEEFOERVyFuJSVXH8grcmJjiqKY35ds8rBCxvABUeyyuQ==", + "version": "3.14.1", + "resolved": "https://registry.npmjs.org/terser/-/terser-3.14.1.tgz", + "integrity": "sha512-NSo3E99QDbYSMeJaEk9YW2lTg3qS9V0aKGlb+PlOrei1X02r1wSBHCNX/O+yeTRFSWPKPIGj6MqvvdqV4rnVGw==", "dev": true, "requires": { "commander": "~2.17.1", @@ -13725,9 +12021,9 @@ "dev": true }, "source-map-support": { - "version": "0.5.9", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.9.tgz", - "integrity": "sha512-gR6Rw4MvUlYy83vP0vxoVNzM6t8MUXqNuRsuBmBHQDu1Fh6X015FrLdgoDKcNdkwGubozq0P4N0Q37UyFVr1EA==", + "version": "0.5.10", + "resolved": 
"https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.10.tgz", + "integrity": "sha512-YfQ3tQFTK/yzlGJuX8pTwa4tifQj4QS2Mj7UegOu8jAz59MqIiMGPXxQhVQiIMNzayuUSF/jEuVnfFF5JqybmQ==", "dev": true, "requires": { "buffer-from": "^1.0.0", @@ -13737,9 +12033,9 @@ } }, "terser-webpack-plugin": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-1.1.0.tgz", - "integrity": "sha512-61lV0DSxMAZ8AyZG7/A4a3UPlrbOBo8NIQ4tJzLPAdGOQ+yoNC7l5ijEow27lBAL2humer01KLS6bGIMYQxKoA==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-1.2.1.tgz", + "integrity": "sha512-GGSt+gbT0oKcMDmPx4SRSfJPE1XaN3kQRWG4ghxKQw9cn5G9x6aCKSsgYdvyM0na9NJ4Drv0RG6jbBByZ5CMjw==", "dev": true, "requires": { "cacache": "^11.0.2", @@ -13771,6 +12067,97 @@ "object-assign": "^4.1.0", "read-pkg-up": "^1.0.1", "require-main-filename": "^1.0.1" + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } } }, "test-value": { @@ -13801,24 +12188,24 @@ }, "through": { "version": "2.3.8", - "resolved": "http://registry.npmjs.org/through/-/through-2.3.8.tgz", + "resolved": 
"https://registry.npmjs.org/through/-/through-2.3.8.tgz", "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=", "dev": true }, "through2": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.3.tgz", - "integrity": "sha1-AARWmzfHx0ujnEPzzteNGtlBQL4=", + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.5.tgz", + "integrity": "sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ==", "dev": true, "requires": { - "readable-stream": "^2.1.5", + "readable-stream": "~2.3.6", "xtend": "~4.0.1" } }, "through2-filter": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/through2-filter/-/through2-filter-2.0.0.tgz", - "integrity": "sha1-YLxVoNrLdghdsfna6Zq0P4PWIuw=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/through2-filter/-/through2-filter-3.0.0.tgz", + "integrity": "sha512-jaRjI2WxN3W1V8/FMZ9HKIBXixtiqs3SQSX4/YGIiP3gL6djW48VoZq9tDqeCWs3MT8YY5wb/zli8VW8snY1CA==", "dev": true, "requires": { "through2": "~2.0.0", @@ -13894,6 +12281,17 @@ "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "to-regex": { @@ -13916,17 +12314,6 @@ "requires": { "is-number": "^3.0.0", "repeat-string": "^1.6.1" - }, - "dependencies": { - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - } - } } }, "to-through": { @@ -13946,432 +12333,152 @@ "requires": { "psl": "^1.1.24", "punycode": "^1.4.1" - } - }, - "tr46": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-1.0.1.tgz", - "integrity": "sha1-qLE/1r/SSJUZZ0zN5VujaTtwbQk=", - "dev": true, - "requires": { - "punycode": "^2.1.0" - }, - "dependencies": { - "punycode": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", - "dev": true - } - } - }, - "trash": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/trash/-/trash-4.3.0.tgz", - "integrity": "sha512-f36TKwIaBiXm63xSrn8OTNghg5CYHBsFVJvcObMo76LRpgariuRi2CqXQHw1VzfeximD0igdGaonOG6N760BtQ==", - "dev": true, - "requires": { - "escape-string-applescript": "^2.0.0", - "fs-extra": "^0.30.0", - "globby": "^7.1.1", - "p-map": "^1.2.0", - "p-try": "^1.0.0", - "pify": "^3.0.0", - "run-applescript": "^3.0.0", - "uuid": "^3.1.0", - "xdg-trashdir": "^2.1.1" - }, - "dependencies": { - "fs-extra": { - "version": "0.30.0", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-0.30.0.tgz", - "integrity": "sha1-8jP/zAjU2n1DLapEl3aYnbHfk/A=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "jsonfile": "^2.1.0", - "klaw": "^1.0.0", - "path-is-absolute": "^1.0.0", - "rimraf": "^2.2.8" - } - }, - "globby": { - "version": "7.1.1", - "resolved": "https://registry.npmjs.org/globby/-/globby-7.1.1.tgz", - "integrity": "sha1-+yzP+UAfhgCUXfral0QMypcrhoA=", - "dev": true, - "requires": { - "array-union": "^1.0.1", - "dir-glob": "^2.0.0", - "glob": "^7.1.2", - "ignore": "^3.3.5", - "pify": "^3.0.0", - "slash": "^1.0.0" - } - }, - "jsonfile": { - "version": "2.4.0", - "resolved": 
"http://registry.npmjs.org/jsonfile/-/jsonfile-2.4.0.tgz", - "integrity": "sha1-NzaitCi4e72gzIO1P6PWM6NcKug=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.6" - } - } - } - }, - "trim-newlines": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/trim-newlines/-/trim-newlines-2.0.0.tgz", - "integrity": "sha1-tAPQuRvlDDMd/EuC7s6yLD3hbSA=", - "dev": true - }, - "trim-off-newlines": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/trim-off-newlines/-/trim-off-newlines-1.0.1.tgz", - "integrity": "sha1-n5up2e+odkw4dpi8v+sshI8RrbM=", - "dev": true - }, - "trim-right": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/trim-right/-/trim-right-1.0.1.tgz", - "integrity": "sha1-yy4SAwZ+DI3h9hQJS5/kVwTqYAM=", - "dev": true - }, - "ts-jest": { - "version": "22.4.6", - "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-22.4.6.tgz", - "integrity": "sha512-kYQ6g1G1AU+bOO9rv+SSQXg4WTcni6Wx3AM48iHni0nP1vIuhdNRjKTE9Cxx36Ix/IOV7L85iKu07dgXJzH2pQ==", - "dev": true, - "requires": { - "babel-core": "^6.26.3", - "babel-plugin-istanbul": "^4.1.6", - "babel-plugin-transform-es2015-modules-commonjs": "^6.26.2", - "babel-preset-jest": "^22.4.3", - "cpx": "^1.5.0", - "fs-extra": "6.0.0", - "jest-config": "^22.4.3", - "lodash": "^4.17.10", - "pkg-dir": "^2.0.0", - "source-map-support": "^0.5.5", - "yargs": "^11.0.0" }, - "dependencies": { - "ansi-regex": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", - "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", - "dev": true - }, - "babel-plugin-jest-hoist": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/babel-plugin-jest-hoist/-/babel-plugin-jest-hoist-22.4.4.tgz", - "integrity": "sha512-DUvGfYaAIlkdnygVIEl0O4Av69NtuQWcrjMOv6DODPuhuGLDnbsARz3AwiiI/EkIMMlxQDUcrZ9yoyJvTNjcVQ==", - "dev": true - }, - "babel-preset-jest": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-22.4.4.tgz", - "integrity": "sha512-+dxMtOFwnSYWfum0NaEc0O03oSdwBsjx4tMSChRDPGwu/4wSY6Q6ANW3wkjKpJzzguaovRs/DODcT4hbSN8yiA==", - "dev": true, - "requires": { - "babel-plugin-jest-hoist": "^22.4.4", - "babel-plugin-syntax-object-rest-spread": "^6.13.0" - } - }, - "cliui": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", - "integrity": "sha512-4FG+RSG9DL7uEwRUZXZn3SS34DiDPfzP0VOiEwtUWlE+AR2EIg+hSyvrIgUUfhdgR/UkAeW2QHgeP+hWrXs7jQ==", - "dev": true, - "requires": { - "string-width": "^2.1.1", - "strip-ansi": "^4.0.0", - "wrap-ansi": "^2.0.0" - } - }, - "expect": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/expect/-/expect-22.4.3.tgz", - "integrity": "sha512-XcNXEPehqn8b/jm8FYotdX0YrXn36qp4HWlrVT4ktwQas1l1LPxiVWncYnnL2eyMtKAmVIaG0XAp0QlrqJaxaA==", - "dev": true, - "requires": { - "ansi-styles": "^3.2.0", - "jest-diff": "^22.4.3", - "jest-get-type": "^22.4.3", - "jest-matcher-utils": "^22.4.3", - "jest-message-util": "^22.4.3", - "jest-regex-util": "^22.4.3" - } - }, - "fs-extra": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-6.0.0.tgz", - "integrity": "sha512-lk2cUCo8QzbiEWEbt7Cw3m27WMiRG321xsssbcIpfMhpRjrlC08WBOVQqj1/nQYYNnPtyIhP1oqLO3QwT2tPCw==", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "jsonfile": "^4.0.0", - "universalify": "^0.1.0" - } - }, - "is-fullwidth-code-point": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", - 
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", - "dev": true - }, - "jest-config": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/jest-config/-/jest-config-22.4.4.tgz", - "integrity": "sha512-9CKfo1GC4zrXSoMLcNeDvQBfgtqGTB1uP8iDIZ97oB26RCUb886KkKWhVcpyxVDOUxbhN+uzcBCeFe7w+Iem4A==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "glob": "^7.1.1", - "jest-environment-jsdom": "^22.4.1", - "jest-environment-node": "^22.4.1", - "jest-get-type": "^22.1.0", - "jest-jasmine2": "^22.4.4", - "jest-regex-util": "^22.1.0", - "jest-resolve": "^22.4.2", - "jest-util": "^22.4.1", - "jest-validate": "^22.4.4", - "pretty-format": "^22.4.0" - } - }, - "jest-diff": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-diff/-/jest-diff-22.4.3.tgz", - "integrity": "sha512-/QqGvCDP5oZOF6PebDuLwrB2BMD8ffJv6TAGAdEVuDx1+uEgrHpSFrfrOiMRx2eJ1hgNjlQrOQEHetVwij90KA==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "diff": "^3.2.0", - "jest-get-type": "^22.4.3", - "pretty-format": "^22.4.3" - } - }, - "jest-environment-jsdom": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-22.4.3.tgz", - "integrity": "sha512-FviwfR+VyT3Datf13+ULjIMO5CSeajlayhhYQwpzgunswoaLIPutdbrnfUHEMyJCwvqQFaVtTmn9+Y8WCt6n1w==", - "dev": true, - "requires": { - "jest-mock": "^22.4.3", - "jest-util": "^22.4.3", - "jsdom": "^11.5.1" - } - }, - "jest-environment-node": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-environment-node/-/jest-environment-node-22.4.3.tgz", - "integrity": "sha512-reZl8XF6t/lMEuPWwo9OLfttyC26A5AMgDyEQ6DBgZuyfyeNUzYT8BFo6uxCCP/Av/b7eb9fTi3sIHFPBzmlRA==", - "dev": true, - "requires": { - "jest-mock": "^22.4.3", - "jest-util": "^22.4.3" - } - }, - "jest-jasmine2": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/jest-jasmine2/-/jest-jasmine2-22.4.4.tgz", - "integrity": "sha512-nK3vdUl50MuH7vj/8at7EQVjPGWCi3d5+6aCi7Gxy/XMWdOdbH1qtO/LjKbqD8+8dUAEH+BVVh7HkjpCWC1CSw==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "co": "^4.6.0", - "expect": "^22.4.0", - "graceful-fs": "^4.1.11", - "is-generator-fn": "^1.0.0", - "jest-diff": "^22.4.0", - "jest-matcher-utils": "^22.4.0", - "jest-message-util": "^22.4.0", - "jest-snapshot": "^22.4.0", - "jest-util": "^22.4.1", - "source-map-support": "^0.5.0" - } - }, - "jest-matcher-utils": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-22.4.3.tgz", - "integrity": "sha512-lsEHVaTnKzdAPR5t4B6OcxXo9Vy4K+kRRbG5gtddY8lBEC+Mlpvm1CJcsMESRjzUhzkz568exMV1hTB76nAKbA==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "jest-get-type": "^22.4.3", - "pretty-format": "^22.4.3" - } - }, - "jest-message-util": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-message-util/-/jest-message-util-22.4.3.tgz", - "integrity": "sha512-iAMeKxhB3Se5xkSjU0NndLLCHtP4n+GtCqV0bISKA5dmOXQfEbdEmYiu2qpnWBDCQdEafNDDU6Q+l6oBMd/+BA==", - "dev": true, - "requires": { - "@babel/code-frame": "^7.0.0-beta.35", - "chalk": "^2.0.1", - "micromatch": "^2.3.11", - "slash": "^1.0.0", - "stack-utils": "^1.0.1" - } - }, - "jest-mock": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-mock/-/jest-mock-22.4.3.tgz", - "integrity": "sha512-+4R6mH5M1G4NK16CKg9N1DtCaFmuxhcIqF4lQK/Q1CIotqMs/XBemfpDPeVZBFow6iyUNu6EBT9ugdNOTT5o5Q==", - "dev": true - }, - "jest-regex-util": { - "version": "22.4.3", - "resolved": 
"http://registry.npmjs.org/jest-regex-util/-/jest-regex-util-22.4.3.tgz", - "integrity": "sha512-LFg1gWr3QinIjb8j833bq7jtQopiwdAs67OGfkPrvy7uNUbVMfTXXcOKXJaeY5GgjobELkKvKENqq1xrUectWg==", + "dependencies": { + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", "dev": true - }, - "jest-resolve": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-resolve/-/jest-resolve-22.4.3.tgz", - "integrity": "sha512-u3BkD/MQBmwrOJDzDIaxpyqTxYH+XqAXzVJP51gt29H8jpj3QgKof5GGO2uPGKGeA1yTMlpbMs1gIQ6U4vcRhw==", - "dev": true, - "requires": { - "browser-resolve": "^1.11.2", - "chalk": "^2.0.1" - } - }, - "jest-snapshot": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-snapshot/-/jest-snapshot-22.4.3.tgz", - "integrity": "sha512-JXA0gVs5YL0HtLDCGa9YxcmmV2LZbwJ+0MfyXBBc5qpgkEYITQFJP7XNhcHFbUvRiniRpRbGVfJrOoYhhGE0RQ==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "jest-diff": "^22.4.3", - "jest-matcher-utils": "^22.4.3", - "mkdirp": "^0.5.1", - "natural-compare": "^1.4.0", - "pretty-format": "^22.4.3" - } - }, - "jest-util": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-util/-/jest-util-22.4.3.tgz", - "integrity": "sha512-rfDfG8wyC5pDPNdcnAlZgwKnzHvZDu8Td2NJI/jAGKEGxJPYiE4F0ss/gSAkG4778Y23Hvbz+0GMrDJTeo7RjQ==", - "dev": true, - "requires": { - "callsites": "^2.0.0", - "chalk": "^2.0.1", - "graceful-fs": "^4.1.11", - "is-ci": "^1.0.10", - "jest-message-util": "^22.4.3", - "mkdirp": "^0.5.1", - "source-map": "^0.6.0" - } - }, - "jest-validate": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/jest-validate/-/jest-validate-22.4.4.tgz", - "integrity": "sha512-dmlf4CIZRGvkaVg3fa0uetepcua44DHtktHm6rcoNVtYlpwe6fEJRkMFsaUVcFHLzbuBJ2cPw9Gl9TKfnzMVwg==", + } + } + }, + "tr46": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-1.0.1.tgz", + "integrity": "sha1-qLE/1r/SSJUZZ0zN5VujaTtwbQk=", + "dev": true, + "requires": { + "punycode": "^2.1.0" + } + }, + "trash": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/trash/-/trash-4.3.0.tgz", + "integrity": "sha512-f36TKwIaBiXm63xSrn8OTNghg5CYHBsFVJvcObMo76LRpgariuRi2CqXQHw1VzfeximD0igdGaonOG6N760BtQ==", + "dev": true, + "requires": { + "escape-string-applescript": "^2.0.0", + "fs-extra": "^0.30.0", + "globby": "^7.1.1", + "p-map": "^1.2.0", + "p-try": "^1.0.0", + "pify": "^3.0.0", + "run-applescript": "^3.0.0", + "uuid": "^3.1.0", + "xdg-trashdir": "^2.1.1" + }, + "dependencies": { + "fs-extra": { + "version": "0.30.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-0.30.0.tgz", + "integrity": "sha1-8jP/zAjU2n1DLapEl3aYnbHfk/A=", "dev": true, "requires": { - "chalk": "^2.0.1", - "jest-config": "^22.4.4", - "jest-get-type": "^22.1.0", - "leven": "^2.1.0", - "pretty-format": "^22.4.0" + "graceful-fs": "^4.1.2", + "jsonfile": "^2.1.0", + "klaw": "^1.0.0", + "path-is-absolute": "^1.0.0", + "rimraf": "^2.2.8" } }, - "os-locale": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", - "integrity": "sha512-3sslG3zJbEYcaC4YVAvDorjGxc7tv6KVATnLPZONiljsUncvihe9BQoVCEs0RZ1kmf4Hk9OBqlZfJZWI4GanKA==", + "globby": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/globby/-/globby-7.1.1.tgz", + "integrity": "sha1-+yzP+UAfhgCUXfral0QMypcrhoA=", "dev": true, "requires": { - "execa": "^0.7.0", - "lcid": "^1.0.0", - "mem": "^1.1.0" + "array-union": "^1.0.1", + "dir-glob": 
"^2.0.0", + "glob": "^7.1.2", + "ignore": "^3.3.5", + "pify": "^3.0.0", + "slash": "^1.0.0" } }, - "pretty-format": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/pretty-format/-/pretty-format-22.4.3.tgz", - "integrity": "sha512-S4oT9/sT6MN7/3COoOy+ZJeA92VmOnveLHgrwBE3Z1W5N9S2A1QGNYiE1z75DAENbJrXXUb+OWXhpJcg05QKQQ==", + "jsonfile": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-2.4.0.tgz", + "integrity": "sha1-NzaitCi4e72gzIO1P6PWM6NcKug=", "dev": true, "requires": { - "ansi-regex": "^3.0.0", - "ansi-styles": "^3.2.0" + "graceful-fs": "^4.1.6" } - }, - "source-map": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", - "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + } + } + }, + "trim-newlines": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/trim-newlines/-/trim-newlines-2.0.0.tgz", + "integrity": "sha1-tAPQuRvlDDMd/EuC7s6yLD3hbSA=", + "dev": true + }, + "trim-off-newlines": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/trim-off-newlines/-/trim-off-newlines-1.0.1.tgz", + "integrity": "sha1-n5up2e+odkw4dpi8v+sshI8RrbM=", + "dev": true + }, + "trim-right": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/trim-right/-/trim-right-1.0.1.tgz", + "integrity": "sha1-yy4SAwZ+DI3h9hQJS5/kVwTqYAM=", + "dev": true + }, + "ts-jest": { + "version": "23.10.5", + "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-23.10.5.tgz", + "integrity": "sha512-MRCs9qnGoyKgFc8adDEntAOP64fWK1vZKnOYU1o2HxaqjdJvGqmkLCPCnVq1/If4zkUmEjKPnCiUisTrlX2p2A==", + "dev": true, + "requires": { + "bs-logger": "0.x", + "buffer-from": "1.x", + "fast-json-stable-stringify": "2.x", + "json5": "2.x", + "make-error": "1.x", + "mkdirp": "0.x", + "resolve": "1.x", + "semver": "^5.5", + "yargs-parser": "10.x" + }, + "dependencies": { + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", "dev": true }, - "source-map-support": { - "version": "0.5.9", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.9.tgz", - "integrity": "sha512-gR6Rw4MvUlYy83vP0vxoVNzM6t8MUXqNuRsuBmBHQDu1Fh6X015FrLdgoDKcNdkwGubozq0P4N0Q37UyFVr1EA==", - "dev": true, - "requires": { - "buffer-from": "^1.0.0", - "source-map": "^0.6.0" - } - }, - "string-width": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", - "integrity": "sha512-nOqH59deCq9SRHlxq1Aw85Jnt4w6KvLKqWVik6oA9ZklXLNIOlqg4F2yrT1MVaTjAqvVwdfeZ7w7aCvJD7ugkw==", - "dev": true, - "requires": { - "is-fullwidth-code-point": "^2.0.0", - "strip-ansi": "^4.0.0" - } - }, - "strip-ansi": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", - "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "json5": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.1.0.tgz", + "integrity": "sha512-8Mh9h6xViijj36g7Dxi+Y4S6hNGV96vcJZr/SrlHh1LR/pEn/8j/+qIBbs44YKl69Lrfctp4QD+AdWLTMqEZAQ==", "dev": true, "requires": { - "ansi-regex": "^3.0.0" + "minimist": "^1.2.0" } }, - "yargs": { - "version": "11.1.0", - "resolved": "http://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", - "integrity": "sha512-NwW69J42EsCSanF8kyn5upxvjp5ds+t3+udGBeTbFnERA+lF541DDpMawzo4z6W/QrzNM18D+BPMiOBibnFV5A==", + "yargs-parser": { + "version": "10.1.0", + 
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-10.1.0.tgz", + "integrity": "sha512-VCIyR1wJoEBZUqk5PA+oOBF6ypbwh5aNB3I50guxAL/quggdfs4TtNHQrSazFA3fYZ+tEqfs0zIGlv0c/rgjbQ==", "dev": true, "requires": { - "cliui": "^4.0.0", - "decamelize": "^1.1.1", - "find-up": "^2.1.0", - "get-caller-file": "^1.0.1", - "os-locale": "^2.0.0", - "require-directory": "^2.1.1", - "require-main-filename": "^1.0.1", - "set-blocking": "^2.0.0", - "string-width": "^2.0.0", - "which-module": "^2.0.0", - "y18n": "^3.2.1", - "yargs-parser": "^9.0.2" + "camelcase": "^4.1.0" } } } }, "ts-node": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-7.0.1.tgz", - "integrity": "sha512-BVwVbPJRspzNh2yfslyT1PSbl5uIk03EZlb493RKHN4qej/D06n1cEhjlOJG69oFsE7OT8XjpTUcYf6pKTLMhw==", + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-8.0.2.tgz", + "integrity": "sha512-MosTrinKmaAcWgO8tqMjMJB22h+sp3Rd1i4fdoWY4mhBDekOwIAKI/bzmRi7IcbCmjquccYg2gcF6NBkLgr0Tw==", "dev": true, "requires": { - "arrify": "^1.0.0", - "buffer-from": "^1.1.0", + "arg": "^4.1.0", "diff": "^3.1.0", "make-error": "^1.1.1", - "minimist": "^1.2.0", - "mkdirp": "^0.5.1", "source-map-support": "^0.5.6", - "yn": "^2.0.0" + "yn": "^3.0.0" }, "dependencies": { "source-map": { @@ -14381,9 +12488,9 @@ "dev": true }, "source-map-support": { - "version": "0.5.9", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.9.tgz", - "integrity": "sha512-gR6Rw4MvUlYy83vP0vxoVNzM6t8MUXqNuRsuBmBHQDu1Fh6X015FrLdgoDKcNdkwGubozq0P4N0Q37UyFVr1EA==", + "version": "0.5.10", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.10.tgz", + "integrity": "sha512-YfQ3tQFTK/yzlGJuX8pTwa4tifQj4QS2Mj7UegOu8jAz59MqIiMGPXxQhVQiIMNzayuUSF/jEuVnfFF5JqybmQ==", "dev": true, "requires": { "buffer-from": "^1.0.0", @@ -14398,9 +12505,9 @@ "integrity": "sha512-4krF8scpejhaOgqzBEcGM7yDIEfi0/8+8zDRZhNZZ2kjmHJ4hv3zCbQWxoJGz1iw5U0Jl0nma13xzHXcncMavQ==" }, "tslint": { - "version": "5.11.0", - "resolved": "https://registry.npmjs.org/tslint/-/tslint-5.11.0.tgz", - "integrity": "sha1-mPMMAurjzecAYgHkwzywi0hYHu0=", + "version": "5.12.1", + "resolved": "https://registry.npmjs.org/tslint/-/tslint-5.12.1.tgz", + "integrity": "sha512-sfodBHOucFg6egff8d1BvuofoOQ/nOeYNfbp7LDlKBcLNrL3lmS5zoiDGyOMdT7YsEXAwWpTdAHwOGOc8eRZAw==", "dev": true, "requires": { "babel-code-frame": "^6.22.0", @@ -14463,9 +12570,9 @@ "dev": true }, "typedoc": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.12.0.tgz", - "integrity": "sha512-dsdlaYZ7Je8JC+jQ3j2Iroe4uyD0GhqzADNUVyBRgLuytQDP/g0dPkAw5PdM/4drnmmJjRzSWW97FkKo+ITqQg==", + "version": "0.14.2", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.14.2.tgz", + "integrity": "sha512-aEbgJXV8/KqaVhcedT7xG6d2r+mOvB5ep3eIz1KuB5sc4fDYXcepEEMdU7XSqLFO5hVPu0nllHi1QxX2h/QlpQ==", "dev": true, "requires": { "@types/fs-extra": "^5.0.3", @@ -14477,14 +12584,14 @@ "@types/shelljs": "^0.8.0", "fs-extra": "^7.0.0", "handlebars": "^4.0.6", - "highlight.js": "^9.0.0", + "highlight.js": "^9.13.1", "lodash": "^4.17.10", "marked": "^0.4.0", "minimatch": "^3.0.0", "progress": "^2.0.0", "shelljs": "^0.8.2", "typedoc-default-themes": "^0.5.0", - "typescript": "3.0.x" + "typescript": "3.2.x" } }, "typedoc-default-themes": { @@ -14494,9 +12601,9 @@ "dev": true }, "typescript": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.0.3.tgz", - "integrity": 
"sha512-kk80vLW9iGtjMnIv11qyxLqZm20UklzuR2tL0QAnDIygIUIemcZMxlMWudl9OOt76H3ntVzcTiddQ1/pAAJMYg==", + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.2.4.tgz", + "integrity": "sha512-0RNDbSdEokBeEAkgNbxJ+BLwSManFy9TeXz8uW+48j/xhEXv1ePME60olyzw2XzUqUBNAYFeJadIqAgNqIACwg==", "dev": true }, "typical": { @@ -14524,155 +12631,6 @@ } } }, - "uglifyjs-webpack-plugin": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/uglifyjs-webpack-plugin/-/uglifyjs-webpack-plugin-1.3.0.tgz", - "integrity": "sha512-ovHIch0AMlxjD/97j9AYovZxG5wnHOPkL7T1GKochBADp/Zwc44pEWNqpKl1Loupp1WhFg7SlYmHZRUfdAacgw==", - "dev": true, - "requires": { - "cacache": "^10.0.4", - "find-cache-dir": "^1.0.0", - "schema-utils": "^0.4.5", - "serialize-javascript": "^1.4.0", - "source-map": "^0.6.1", - "uglify-es": "^3.3.4", - "webpack-sources": "^1.1.0", - "worker-farm": "^1.5.2" - }, - "dependencies": { - "ajv": { - "version": "6.6.1", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.1.tgz", - "integrity": "sha512-ZoJjft5B+EJBjUyu9C9Hc0OZyPZSSlOF+plzouTrg6UlA8f+e/n8NIgBFG/9tppJtpPWfthHakK7juJdNDODww==", - "dev": true, - "requires": { - "fast-deep-equal": "^2.0.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "cacache": { - "version": "10.0.4", - "resolved": "https://registry.npmjs.org/cacache/-/cacache-10.0.4.tgz", - "integrity": "sha512-Dph0MzuH+rTQzGPNT9fAnrPmMmjKfST6trxJeK7NQuHRaVw24VzPRWTmg9MpcwOVQZO0E1FBICUlFeNaKPIfHA==", - "dev": true, - "requires": { - "bluebird": "^3.5.1", - "chownr": "^1.0.1", - "glob": "^7.1.2", - "graceful-fs": "^4.1.11", - "lru-cache": "^4.1.1", - "mississippi": "^2.0.0", - "mkdirp": "^0.5.1", - "move-concurrently": "^1.0.1", - "promise-inflight": "^1.0.1", - "rimraf": "^2.6.2", - "ssri": "^5.2.4", - "unique-filename": "^1.1.0", - "y18n": "^4.0.0" - } - }, - "commander": { - "version": "2.13.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.13.0.tgz", - "integrity": "sha512-MVuS359B+YzaWqjCL/c+22gfryv+mCBPHAv3zyVI2GN8EY6IRP8VwtasXn8jyyhvvq84R4ImN1OKRtcbIasjYA==", - "dev": true - }, - "fast-deep-equal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", - "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", - "dev": true - }, - "find-cache-dir": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-1.0.0.tgz", - "integrity": "sha1-kojj6ePMN0hxfTnq3hfPcfww7m8=", - "dev": true, - "requires": { - "commondir": "^1.0.1", - "make-dir": "^1.0.0", - "pkg-dir": "^2.0.0" - } - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true - }, - "mississippi": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mississippi/-/mississippi-2.0.0.tgz", - "integrity": "sha512-zHo8v+otD1J10j/tC+VNoGK9keCuByhKovAvdn74dmxJl9+mWHnx6EMsDN4lgRoMI/eYo2nchAxniIbUPb5onw==", - "dev": true, - "requires": { - "concat-stream": "^1.5.0", - "duplexify": "^3.4.2", - "end-of-stream": "^1.1.0", - "flush-write-stream": "^1.0.0", - "from2": "^2.1.0", - "parallel-transform": "^1.1.0", - "pump": "^2.0.1", - "pumpify": "^1.3.3", - "stream-each": "^1.1.0", - "through2": "^2.0.0" - } - }, - "pump": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", - "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", - "dev": true, - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, - "schema-utils": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.7.tgz", - "integrity": "sha512-v/iwU6wvwGK8HbU9yi3/nhGzP0yGSuhQMzL6ySiec1FSrZZDkhm4noOSWzrNFo/jEc+SJY6jRTwuwbSXJPDUnQ==", - "dev": true, - "requires": { - "ajv": "^6.1.0", - "ajv-keywords": "^3.1.0" - } - }, - "source-map": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", - "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", - "dev": true - }, - "ssri": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/ssri/-/ssri-5.3.0.tgz", - "integrity": "sha512-XRSIPqLij52MtgoQavH/x/dU1qVKtWUAAZeOHsR9c2Ddi4XerFy3mc1alf+dLJKl9EUIm/Ht+EowFkTUOA6GAQ==", - "dev": true, - "requires": { - "safe-buffer": "^5.1.1" - } - }, - "uglify-es": { - "version": "3.3.9", - "resolved": "https://registry.npmjs.org/uglify-es/-/uglify-es-3.3.9.tgz", - "integrity": "sha512-r+MU0rfv4L/0eeW3xZrd16t4NZfK8Ld4SWVglYBb7ez5uXFWHuVRs6xCTrf1yirs9a4j4Y27nn7SRfO6v67XsQ==", - "dev": true, - "requires": { - "commander": "~2.13.0", - "source-map": "~0.6.1" - } - }, - "y18n": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", - "integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==", - "dev": true - } - } - }, "uid-number": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/uid-number/-/uid-number-0.0.6.tgz", @@ -14768,13 +12726,13 @@ } }, "unique-stream": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/unique-stream/-/unique-stream-2.2.1.tgz", - "integrity": "sha1-WqADz76Uxf+GbE59ZouxxNuts2k=", + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/unique-stream/-/unique-stream-2.3.1.tgz", + "integrity": "sha512-2nY4TnBE70yoxHkDli7DMazpWiP7xMdCYqU2nBRO0UB+ZpEkGsSija7MvmvnZFUeC+mrgiUfcHSr3LmRFIg4+A==", "dev": true, "requires": { - "json-stable-stringify": "^1.0.0", - "through2-filter": "^2.0.0" + "json-stable-stringify-without-jsonify": "^1.0.1", + "through2-filter": "^3.0.0" } }, "universalify": { @@ -14820,12 +12778,6 @@ "resolved": "https://registry.npmjs.org/has-values/-/has-values-0.1.4.tgz", "integrity": "sha1-bWHeldkd/Km5oCCJrThL/49it3E=", "dev": true - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true } } }, @@ -14842,14 +12794,6 @@ "dev": true, "requires": { "punycode": "^2.1.0" - }, - "dependencies": { - "punycode": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", - "dev": true - } } }, "urix": { @@ -14882,10 +12826,19 @@ "integrity": "sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==", "dev": true }, + "user-home": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/user-home/-/user-home-2.0.0.tgz", + "integrity": "sha1-nHC/2Babwdy/SGBODwS4tJzenp8=", + "dev": true, + "requires": { + "os-homedir": "^1.0.0" + } + }, "util": { - "version": "0.10.4", - "resolved": 
"https://registry.npmjs.org/util/-/util-0.10.4.tgz", - "integrity": "sha512-0Pm9hTQ3se5ll1XihRic3FDIku70C+iHUdT/W926rSgHV5QgXsYbKZN8MSC3tJtSkhuROzvsQjAaFENRXr+19A==", + "version": "0.11.1", + "resolved": "https://registry.npmjs.org/util/-/util-0.11.1.tgz", + "integrity": "sha512-HShAsny+zS2TZfaXxD9tYj4HQGlBezXZMZuM/S5PKLLoZkShZiGk9o5CzukI1LVHZvjdvZ2Sj1aW/Ndn2NB/HQ==", "dev": true, "requires": { "inherits": "2.0.3" @@ -14914,9 +12867,9 @@ "dev": true }, "v8flags": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/v8flags/-/v8flags-3.1.1.tgz", - "integrity": "sha512-iw/1ViSEaff8NJ3HLyEjawk/8hjJib3E7pvG4pddVXfUg1983s3VGsiClDjhK64MQVDGqc1Q8r18S4VKQZS9EQ==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/v8flags/-/v8flags-3.1.2.tgz", + "integrity": "sha512-MtivA7GF24yMPte9Rp/BWGCYQNaUj86zeYxV/x2RRJMKagImbbv3u8iJC57lNhWLPcGLJmHcHmFWkNsplbbLWw==", "dev": true, "requires": { "homedir-polyfill": "^1.0.1" @@ -14983,433 +12936,90 @@ "graceful-fs": "^4.0.0", "is-valid-glob": "^1.0.0", "lazystream": "^1.0.0", - "lead": "^1.0.0", - "object.assign": "^4.0.4", - "pumpify": "^1.3.5", - "readable-stream": "^2.3.3", - "remove-bom-buffer": "^3.0.0", - "remove-bom-stream": "^1.2.0", - "resolve-options": "^1.1.0", - "through2": "^2.0.0", - "to-through": "^2.0.0", - "value-or-function": "^3.0.0", - "vinyl": "^2.0.0", - "vinyl-sourcemap": "^1.1.0" - } - }, - "vinyl-sourcemap": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/vinyl-sourcemap/-/vinyl-sourcemap-1.1.0.tgz", - "integrity": "sha1-kqgAWTo4cDqM2xHYswCtS+Y7PhY=", - "dev": true, - "requires": { - "append-buffer": "^1.0.2", - "convert-source-map": "^1.5.0", - "graceful-fs": "^4.1.6", - "normalize-path": "^2.1.1", - "now-and-later": "^2.0.0", - "remove-bom-buffer": "^3.0.0", - "vinyl": "^2.0.0" - } - }, - "vinyl-sourcemaps-apply": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/vinyl-sourcemaps-apply/-/vinyl-sourcemaps-apply-0.2.1.tgz", - "integrity": "sha1-q2VJ1h0XLCsbh75cUI0jnI74dwU=", - "dev": true, - "requires": { - "source-map": "^0.5.1" - } - }, - "vm-browserify": { - "version": "0.0.4", - "resolved": "https://registry.npmjs.org/vm-browserify/-/vm-browserify-0.0.4.tgz", - "integrity": "sha1-XX6kW7755Kb/ZflUOOCofDV9WnM=", - "dev": true, - "requires": { - "indexof": "0.0.1" - } - }, - "w3c-hr-time": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.1.tgz", - "integrity": "sha1-gqwr/2PZUOqeMYmlimViX+3xkEU=", - "dev": true, - "requires": { - "browser-process-hrtime": "^0.1.2" - } - }, - "walker": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.7.tgz", - "integrity": "sha1-L3+bj9ENZ3JisYqITijRlhjgKPs=", - "dev": true, - "requires": { - "makeerror": "1.0.x" - } - }, - "watch": { - "version": "0.18.0", - "resolved": "https://registry.npmjs.org/watch/-/watch-0.18.0.tgz", - "integrity": "sha1-KAlUdsbffJDJYxOJkMClQj60uYY=", - "dev": true, - "requires": { - "exec-sh": "^0.2.0", - "minimist": "^1.2.0" - } - }, - "watchpack": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-1.6.0.tgz", - "integrity": "sha512-i6dHe3EyLjMmDlU1/bGQpEw25XSjkJULPuAVKCbNRefQVq48yXKUpwg538F7AZTf9kyr57zj++pQFltUa5H7yA==", - "dev": true, - "requires": { - "chokidar": "^2.0.2", - "graceful-fs": "^4.1.2", - "neo-async": "^2.5.0" - }, - "dependencies": { - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": 
"sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "chokidar": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", - "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", - "dev": true, - "requires": { - "anymatch": "^2.0.0", - "async-each": "^1.0.0", - "braces": "^2.3.0", - "fsevents": "^1.2.2", - "glob-parent": "^3.1.0", - "inherits": "^2.0.1", - "is-binary-path": "^1.0.0", - "is-glob": "^4.0.0", - "lodash.debounce": "^4.0.8", - "normalize-path": "^2.1.1", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0", - "upath": "^1.0.5" - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": 
"https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - 
"version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } + "lead": "^1.0.0", + "object.assign": "^4.0.4", + "pumpify": "^1.3.5", + "readable-stream": "^2.3.3", + "remove-bom-buffer": "^3.0.0", + "remove-bom-stream": "^1.2.0", + "resolve-options": "^1.1.0", + "through2": "^2.0.0", + "to-through": "^2.0.0", + "value-or-function": "^3.0.0", + "vinyl": "^2.0.0", + "vinyl-sourcemap": "^1.1.0" + } + }, + "vinyl-sourcemap": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/vinyl-sourcemap/-/vinyl-sourcemap-1.1.0.tgz", + "integrity": "sha1-kqgAWTo4cDqM2xHYswCtS+Y7PhY=", + "dev": true, + "requires": { + "append-buffer": "^1.0.2", + "convert-source-map": "^1.5.0", + "graceful-fs": "^4.1.6", + "normalize-path": "^2.1.1", + "now-and-later": "^2.0.0", + "remove-bom-buffer": "^3.0.0", + "vinyl": "^2.0.0" + } + }, + "vinyl-sourcemaps-apply": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/vinyl-sourcemaps-apply/-/vinyl-sourcemaps-apply-0.2.1.tgz", + "integrity": 
"sha1-q2VJ1h0XLCsbh75cUI0jnI74dwU=", + "dev": true, + "requires": { + "source-map": "^0.5.1" + } + }, + "vm-browserify": { + "version": "0.0.4", + "resolved": "https://registry.npmjs.org/vm-browserify/-/vm-browserify-0.0.4.tgz", + "integrity": "sha1-XX6kW7755Kb/ZflUOOCofDV9WnM=", + "dev": true, + "requires": { + "indexof": "0.0.1" + } + }, + "w3c-hr-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.1.tgz", + "integrity": "sha1-gqwr/2PZUOqeMYmlimViX+3xkEU=", + "dev": true, + "requires": { + "browser-process-hrtime": "^0.1.2" + } + }, + "walker": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.7.tgz", + "integrity": "sha1-L3+bj9ENZ3JisYqITijRlhjgKPs=", + "dev": true, + "requires": { + "makeerror": "1.0.x" + } + }, + "watch": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/watch/-/watch-0.18.0.tgz", + "integrity": "sha1-KAlUdsbffJDJYxOJkMClQj60uYY=", + "dev": true, + "requires": { + "exec-sh": "^0.2.0", + "minimist": "^1.2.0" + } + }, + "watchpack": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-1.6.0.tgz", + "integrity": "sha512-i6dHe3EyLjMmDlU1/bGQpEw25XSjkJULPuAVKCbNRefQVq48yXKUpwg538F7AZTf9kyr57zj++pQFltUa5H7yA==", + "dev": true, + "requires": { + "chokidar": "^2.0.2", + "graceful-fs": "^4.1.2", + "neo-async": "^2.5.0" } }, "wcwidth": { @@ -15421,6 +13031,12 @@ "defaults": "^1.0.3" } }, + "web-stream-tools": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/web-stream-tools/-/web-stream-tools-0.0.1.tgz", + "integrity": "sha512-MZUYhvTAMMy1u07OJL2pyp/tdrIu15fRJlGgnfvCQVXBS4cBNbIV1+6veYfVhTfnq0ZLispgx4nv17QxpuX+6w==", + "dev": true + }, "webidl-conversions": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-4.0.2.tgz", @@ -15428,17 +13044,17 @@ "dev": true }, "webpack": { - "version": "4.23.1", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-4.23.1.tgz", - "integrity": "sha512-iE5Cu4rGEDk7ONRjisTOjVHv3dDtcFfwitSxT7evtYj/rANJpt1OuC/Kozh1pBa99AUBr1L/LsaNB+D9Xz3CEg==", + "version": "4.29.0", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-4.29.0.tgz", + "integrity": "sha512-pxdGG0keDBtamE1mNvT5zyBdx+7wkh6mh7uzMOo/uRQ/fhsdj5FXkh/j5mapzs060forql1oXqXN9HJGju+y7w==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-module-context": "1.7.10", - "@webassemblyjs/wasm-edit": "1.7.10", - "@webassemblyjs/wasm-parser": "1.7.10", - "acorn": "^5.6.2", - "acorn-dynamic-import": "^3.0.0", + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-module-context": "1.7.11", + "@webassemblyjs/wasm-edit": "1.7.11", + "@webassemblyjs/wasm-parser": "1.7.11", + "acorn": "^6.0.5", + "acorn-dynamic-import": "^4.0.0", "ajv": "^6.1.0", "ajv-keywords": "^3.1.0", "chrome-trace-event": "^1.0.0", @@ -15454,309 +13070,17 @@ "node-libs-browser": "^2.0.0", "schema-utils": "^0.4.4", "tapable": "^1.1.0", - "uglifyjs-webpack-plugin": "^1.2.4", + "terser-webpack-plugin": "^1.1.0", "watchpack": "^1.5.0", "webpack-sources": "^1.3.0" }, "dependencies": { - "ajv": { - "version": "6.6.1", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.1.tgz", - "integrity": "sha512-ZoJjft5B+EJBjUyu9C9Hc0OZyPZSSlOF+plzouTrg6UlA8f+e/n8NIgBFG/9tppJtpPWfthHakK7juJdNDODww==", - "dev": true, - "requires": { - "fast-deep-equal": "^2.0.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "arr-diff": { - "version": 
"4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": 
"https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fast-deep-equal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", - "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", - "dev": true - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - 
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "acorn": { + "version": "6.0.5", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.0.5.tgz", + "integrity": "sha512-i33Zgp3XWtmZBMNvCr4azvOFeWVw1Rk6p3hfi3LUDvIFraOMywb1kAtrbi+med14m4Xfpqm3zRZMT+c0FNE7kg==", "dev": true }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - }, "schema-utils": { "version": "0.4.7", "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.7.tgz", @@ -15797,9 +13121,9 @@ } }, "whatwg-mimetype": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.2.0.tgz", - "integrity": "sha512-5YSO1nMd5D1hY3WzAQV3PzZL83W3YeyR1yW9PcH26Weh1t+Vzh9B6XkDh7aXm83HBZ4nSMvkjvN2H2ySWIvBgw==", + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.3.0.tgz", + "integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g==", "dev": true }, "whatwg-url": { @@ -15823,9 +13147,9 @@ } }, "which-module": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", - "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-1.0.0.tgz", + "integrity": "sha1-u6Y8qGGUiZT/MHc2CJ47lgJsKk8=", "dev": true }, "wide-align": { @@ -15863,7 +13187,7 @@ }, "wrap-ansi": { "version": "2.1.0", - "resolved": "http://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", "integrity": "sha1-2Pw9KE3QV5T+hJc8rs3Rz4JP3YU=", "dev": true, "requires": { @@ -15878,9 +13202,9 @@ "dev": true }, "write-file-atomic": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-2.3.0.tgz", - "integrity": "sha512-xuPeK4OdjWqtfi59ylvVL0Yn35SF3zgcAcv7rBPFHVaEapaDr4GdGgm3j7ckTwH9wHL7fGmgfAnb0+THrHb8tA==", + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-2.4.2.tgz", + "integrity": "sha512-s0b6vB3xIVRLWywa6X9TOMA7k9zio0TMOsl9ZnDkliA/cfJlpHXAscj0gbHVJiTdIuAYpIyqS5GW91fqm6gG5g==", "dev": true, "requires": { "graceful-fs": "^4.1.11", @@ -15953,18 +13277,9 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": 
"https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true - }, - "user-home": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/user-home/-/user-home-2.0.0.tgz", - "integrity": "sha1-nHC/2Babwdy/SGBODwS4tJzenp8=", - "dev": true, - "requires": { - "os-homedir": "^1.0.0" - } } } }, @@ -15986,7 +13301,7 @@ }, "xmlbuilder": { "version": "9.0.7", - "resolved": "http://registry.npmjs.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz", "integrity": "sha1-Ey7mPS7FVlxVfiD0wi35rKaGsQ0=", "dev": true }, @@ -16027,46 +13342,21 @@ "which-module": "^1.0.0", "y18n": "^3.2.1", "yargs-parser": "^5.0.0" - }, - "dependencies": { - "which-module": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/which-module/-/which-module-1.0.0.tgz", - "integrity": "sha1-u6Y8qGGUiZT/MHc2CJ47lgJsKk8=", - "dev": true - }, - "yargs-parser": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-5.0.0.tgz", - "integrity": "sha1-J17PDX/+Bcd+ZOfIbkzZS/DhIoo=", - "dev": true, - "requires": { - "camelcase": "^3.0.0" - } - } } }, "yargs-parser": { - "version": "9.0.2", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-9.0.2.tgz", - "integrity": "sha1-nM9qQ0YP5O1Aqbto9I1DuKaMwHc=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-5.0.0.tgz", + "integrity": "sha1-J17PDX/+Bcd+ZOfIbkzZS/DhIoo=", "dev": true, "requires": { - "camelcase": "^4.1.0" - }, - "dependencies": { - "camelcase": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", - "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", - "dev": true - } + "camelcase": "^3.0.0" } }, "yn": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/yn/-/yn-2.0.0.tgz", - "integrity": "sha1-5a2ryKz0CPY4X8dklWhMiOavaJo=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/yn/-/yn-3.0.0.tgz", + "integrity": "sha512-+Wo/p5VRfxUgBUGy2j/6KX2mj9AYJWOHuhMjMcbBFc3y54o9/4buK1ksBvuiK01C3kby8DH9lSmJdSxw+4G/2Q==", "dev": true } } diff --git a/js/package.json b/js/package.json index 9f76819c2e1fd..fe37d69c72a42 100644 --- a/js/package.json +++ b/js/package.json @@ -8,10 +8,10 @@ }, "scripts": { "lerna": "lerna", - "test": "gulp test", - "build": "gulp build", - "clean": "gulp clean", - "debug": "gulp debug", + "test": "NODE_NO_WARNINGS=1 gulp test", + "build": "NODE_NO_WARNINGS=1 gulp build", + "clean": "NODE_NO_WARNINGS=1 gulp clean", + "debug": "NODE_NO_WARNINGS=1 gulp debug", "perf": "node ./perf/index.js", "test:integration": "node ./bin/integration.js --mode validate", "create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow", @@ -19,11 +19,14 @@ "clean:all": "run-p clean clean:testdata", "clean:testdata": "gulp clean:testdata", "create:testdata": "gulp create:testdata", - "test:coverage": "gulp test -t ts --coverage", - "doc": "shx rm -rf ./doc && typedoc --mode file --out doc src/Arrow.ts", - "lint": "run-p lint:*", + "test:coverage": "gulp test -t src --coverage", + "doc": "shx rm -rf ./doc && typedoc --options typedoc.js", + "lint": "run-p lint:src lint:test", + "lint:ci": "run-p lint:src:ci lint:test:ci", "lint:src": "tslint --fix --project -p tsconfig.json -c tslint.json \"src/**/*.ts\"", "lint:test": "tslint --fix --project -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"", + "lint:src:ci": "tslint --project -p tsconfig.json -c 
tslint.json \"src/**/*.ts\"", + "lint:test:ci": "tslint --project -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"", "prepublishOnly": "echo \"Error: do 'npm run release' instead of 'npm publish'\" && exit 1", "version": "npm install && npm run clean:all" }, @@ -53,99 +56,60 @@ "npm-release.sh" ], "dependencies": { - "@types/flatbuffers": "1.9.0", - "@types/node": "10.12.0", - "@types/text-encoding-utf-8": "1.0.1", + "@types/flatbuffers": "^1.9.0", + "@types/node": "^10.12.18", + "@types/text-encoding-utf-8": "^1.0.1", "command-line-args": "5.0.2", "command-line-usage": "5.0.5", - "flatbuffers": "1.10.2", + "flatbuffers": "^1.10.2", "json-bignum": "0.0.3", + "pad-left": "2.1.0", "text-encoding-utf-8": "1.0.2", - "tslib": "1.9.3" + "tslib": "^1.9.3" }, "devDependencies": { - "@std/esm": "0.26.0", + "@mattiasbuelens/web-streams-polyfill": "0.2.1", "@types/glob": "7.1.1", - "@types/jest": "23.3.5", + "@types/jest": "23.3.13", + "async-done": "1.3.1", "benchmark": "2.1.4", "coveralls": "3.0.2", "del": "3.0.0", + "esm": "3.1.4", "glob": "7.1.3", - "google-closure-compiler": "20181008.0.0", - "gulp": "next", - "gulp-json-transform": "0.4.5", + "google-closure-compiler": "20190121.0.0", + "gulp": "4.0.0", + "gulp-json-transform": "0.4.6", "gulp-rename": "1.4.0", "gulp-sourcemaps": "2.6.4", - "gulp-typescript": "5.0.0-alpha.3", - "ix": "2.3.5", + "gulp-typescript": "5.0.0", + "ix": "2.5.1", "jest": "23.6.0", "jest-environment-node-debug": "2.0.0", + "jest-silent-reporter": "0.1.1", "json": "9.0.6", - "lerna": "3.4.3", - "lint-staged": "7.3.0", - "merge2": "1.2.3", + "lerna": "3.10.7", + "memfs": "2.15.0", "mkdirp": "0.5.1", + "multistream": "2.1.1", "npm-run-all": "4.1.5", - "pump": "3.0.0", - "rimraf": "2.6.2", + "randomatic": "3.1.1", + "rimraf": "2.6.3", "rxjs": "5.5.11", "shx": "0.3.2", "source-map-loader": "0.2.4", - "terser-webpack-plugin": "1.1.0", + "terser-webpack-plugin": "1.2.1", "trash": "4.3.0", - "ts-jest": "22.4.6", - "ts-node": "7.0.1", - "tslint": "5.11.0", - "typedoc": "0.12", - "typescript": "3.0.3", - "webpack": "4.23.1", + "ts-jest": "23.10.5", + "ts-node": "8.0.2", + "tslint": "5.12.1", + "typedoc": "0.14.2", + "typescript": "3.2.4", + "web-stream-tools": "0.0.1", + "webpack": "4.29.0", "xml2js": "0.4.19" }, "engines": { - "node": ">=10.0" - }, - "@std/esm": { - "warnings": false - }, - "lint-staged": { - "*.@(ts)": [ - "tslint --fix", - "git add" - ] - }, - "jest": { - "verbose": false, - "testEnvironment": "node", - "globals": { - "ts-jest": { - "skipBabel": true, - "tsConfigFile": "test/tsconfig.json" - } - }, - "roots": [ - "/test/" - ], - "moduleFileExtensions": [ - "js", - "ts", - "tsx" - ], - "coverageReporters": [ - "lcov" - ], - "coveragePathIgnorePatterns": [ - "fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", - "test\\/.*\\.(ts|tsx|js)$", - "/node_modules/" - ], - "transform": { - ".(ts|tsx)": "./node_modules/ts-jest/preprocessor.js", - ".(js|jsx)": "./node_modules/babel-jest/build/index.js" - }, - "transformIgnorePatterns": [ - "/node_modules/", - "/(es2015|esnext)/umd/" - ], - "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" + "node": ">=11.0" } } diff --git a/js/perf/index.js b/js/perf/index.js index 2c07591925328..0e9c2bd689aae 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,10 +16,10 @@ // under the License. 
// Use the ES5 UMD target as perf baseline -// const { predicate, Table, read: readBatches } = require('../targets/es5/umd'); -// const { predicate, Table, read: readBatches } = require('../targets/es5/cjs'); -// const { predicate, Table, read: readBatches } = require('../targets/es2015/umd'); -const { predicate, Table, read: readBatches } = require('../targets/es2015/cjs'); +// const { predicate, Table, RecordBatchReader } = require('../targets/es5/umd'); +// const { predicate, Table, RecordBatchReader } = require('../targets/es5/cjs'); +// const { predicate, Table, RecordBatchReader } = require('../targets/es2015/umd'); +const { predicate, Table, RecordBatchReader } = require('../targets/es2015/cjs'); const { col } = predicate; const Benchmark = require('benchmark'); @@ -91,7 +91,7 @@ function createReadBatchesTest(name, buffers) { return { async: true, name: `readBatches\n`, - fn() { for (recordBatch of readBatches(buffers)) {} } + fn() { for (recordBatch of RecordBatchReader.from(buffers)) {} } }; } @@ -139,34 +139,36 @@ function createDataFrameDirectCountTest(table, column, test, value) { let sum, colidx = table.schema.fields.findIndex((c)=>c.name === column); if (test == 'gt') { - op = function () { + op = () => { sum = 0; - let batches = table.batches; + let batches = table.chunks; let numBatches = batches.length; for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches const batch = batches[batchIndex]; const vector = batch.getChildAt(colidx); // yield all indices - for (let index = -1; ++index < batch.length;) { + for (let index = -1, length = batch.length; ++index < length;) { sum += (vector.get(index) >= value); } } + return sum; } } else if (test == 'eq') { - op = function() { + op = () => { sum = 0; - let batches = table.batches; + let batches = table.chunks; let numBatches = batches.length; for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches const batch = batches[batchIndex]; const vector = batch.getChildAt(colidx); // yield all indices - for (let index = -1; ++index < batch.length;) { + for (let index = -1, length = batch.length; ++index < length;) { sum += (vector.get(index) === value); } } + return sum; } } else { throw new Error(`Unrecognized test "${test}"`); diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts new file mode 100644 index 0000000000000..f9178df91e782 --- /dev/null +++ b/js/src/Arrow.dom.ts @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
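Arrow.dom.ts, added here, wires the WHATWG-stream machinery into the core classes: it installs `toDOMStream` on the adapter registry and attaches `throughDOM` transforms to `RecordBatchReader` and `RecordBatchWriter`. A hedged usage sketch, assuming a browser-like runtime whose ReadableStreams are async-iterable; the URL is hypothetical:

import { RecordBatch, RecordBatchReader } from 'apache-arrow';

async function logBatches(url: string) {
    const response = await fetch(url);
    // throughDOM() returns a { writable, readable } pair, so it plugs into pipeThrough
    const batches = response.body!.pipeThrough(RecordBatchReader.throughDOM());
    for await (const batch of batches as AsyncIterable<RecordBatch>) {
        console.log(batch.numCols, batch.length);
    }
}

logBatches('/data/table.arrow').catch(console.error);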
+ +import streamAdapters from './io/adapters'; +import { RecordBatchReader } from './ipc/reader'; +import { RecordBatchWriter } from './ipc/writer'; +import { toDOMStream } from './ipc/whatwg/iterable'; +import { recordBatchReaderThroughDOMStream } from './ipc/whatwg/reader'; +import { recordBatchWriterThroughDOMStream } from './ipc/whatwg/writer'; + +streamAdapters.toDOMStream = toDOMStream; +RecordBatchReader['throughDOM'] = recordBatchReaderThroughDOMStream; +RecordBatchWriter['throughDOM'] = recordBatchWriterThroughDOMStream; + +export { + ArrowType, DateUnit, IntervalUnit, MessageHeader, MetadataVersion, Precision, TimeUnit, Type, UnionMode, VectorType, + Data, + DataType, + Null, + Bool, + Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, + Float, Float16, Float32, Float64, + Utf8, + Binary, + FixedSizeBinary, + Date_, DateDay, DateMillisecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Decimal, + List, + Struct, + Union, DenseUnion, SparseUnion, + Dictionary, + Interval, IntervalDayTime, IntervalYearMonth, + FixedSizeList, + Map_, + Table, + Column, + Schema, Field, + Visitor, + Vector, + BaseVector, + BinaryVector, + BoolVector, + Chunked, + DateVector, DateDayVector, DateMillisecondVector, + DecimalVector, + DictionaryVector, + FixedSizeBinaryVector, + FixedSizeListVector, + FloatVector, Float16Vector, Float32Vector, Float64Vector, + IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector, + IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, + ListVector, + MapVector, + NullVector, + StructVector, + TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector, + TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector, + UnionVector, DenseUnionVector, SparseUnionVector, + Utf8Vector, + ByteStream, AsyncByteStream, AsyncByteQueue, ReadableSource, WritableSink, + RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader, + RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter, + MessageReader, AsyncMessageReader, JSONMessageReader, + Message, + RecordBatch, + ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions, + DataFrame, FilteredDataFrame, CountByResult, BindFunc, NextFunc, + predicate, + util +} from './Arrow'; diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js deleted file mode 100644 index 7ad066585712e..0000000000000 --- a/js/src/Arrow.externs.js +++ /dev/null @@ -1,814 +0,0 @@ -// @ts-nocheck -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -/* tslint:disable */ - -/** - * @fileoverview Closure Compiler externs for Arrow - * @externs - * @suppress {duplicate,checkTypes} - */ -/** @type {symbol} */ -Symbol.iterator; -/** @type {symbol} */ -Symbol.asyncIterator; - -var Table = function() {}; -/** @type {?} */ -Table.from = function() {}; -/** @type {?} */ -Table.fromVectors = function() {}; -/** @type {?} */ -Table.fromAsync = function() {}; -/** @type {?} */ -Table.fromStruct = function() {}; -/** @type {?} */ -Table.empty = function() {}; -/** @type {?} */ -Table.prototype.schema; -/** @type {?} */ -Table.prototype.length; -/** @type {?} */ -Table.prototype.numCols; -/** @type {?} */ -Table.prototype.get; -/** @type {?} */ -Table.prototype.getColumn; -/** @type {?} */ -Table.prototype.getColumnAt; -/** @type {?} */ -Table.prototype.getColumnIndex; -/** @type {?} */ -Table.prototype.toArray; -/** @type {?} */ -Table.prototype.select; -/** @type {?} */ -Table.prototype.rowsToString; -/** @type {?} */ -Table.prototype.batchesUnion; -/** @type {?} */ -Table.prototype.batches; -/** @type {?} */ -Table.prototype.countBy; -/** @type {?} */ -Table.prototype.scan; -/** @type {?} */ -Table.prototype.serialize; - -var CountByResult = function() {}; -/** @type {?} */ -CountByResult.prototype.asJSON; - -var col = function () {}; -var lit = function () {}; -var and = function () {}; -var or = function () {}; -var custom = function () {}; - -var Value = function() {}; -/** @type {?} */ -Value.prototype.ge; -/** @type {?} */ -Value.prototype.le; -/** @type {?} */ -Value.prototype.eq; -/** @type {?} */ -Value.prototype.lt; -/** @type {?} */ -Value.prototype.gt; -/** @type {?} */ -Value.prototype.ne; - -var Col = function() {}; -/** @type {?} */ -Col.prototype.bind; -var CombinationPredicate = function () {}; -/** @type {?} */ -CombinationPredicate.prototype.children; -var Or = function() {}; -var And = function() {}; -var Not = function() {}; -var GTeq = function () {}; -/** @type {?} */ -GTeq.prototype.and; -/** @type {?} */ -GTeq.prototype.or; -var LTeq = function () {}; -/** @type {?} */ -LTeq.prototype.and; -/** @type {?} */ -LTeq.prototype.or; -var Equals = function () {}; -/** @type {?} */ -Equals.prototype.and; -/** @type {?} */ -Equals.prototype.or; -var Predicate = function() {}; -/** @type {?} */ -Predicate.prototype.bind; -/** @type {?} */ -Predicate.prototype.and; -/** @type {?} */ -Predicate.prototype.or; -/** @type {?} */ -Predicate.prototype.not; -/** @type {?} */ -Predicate.prototype.ands; -var Literal = function() {}; - -var PipeIterator = function() {}; -/** @type {?} */ -PipeIterator.prototype.pipe; - -var AsyncPipeIterator = function() {}; -/** @type {?} */ -AsyncPipeIterator.prototype.pipe; - -var RecordBatch = function() {}; -/** @type {?} */ -RecordBatch.from = function() {}; -/** @type {?} */ -RecordBatch.prototype.numCols; -/** @type {?} */ -RecordBatch.prototype.length; -/** @type {?} */ -RecordBatch.prototype.schema; -/** @type {?} */ -RecordBatch.prototype.columns; -/** @type {?} */ -RecordBatch.prototype.select; - -var Vector = function() {}; -/** @type {?} */ -Vector.create = function() {}; -/** @type {?} */ -Vector.prototype.data; -/** @type {?} */ -Vector.prototype.type; -/** @type {?} */ -Vector.prototype.length; -/** @type {?} */ -Vector.prototype.nullCount; -/** @type {?} */ -Vector.prototype.nullBitmap; -/** @type {?} */ -Vector.prototype.isValid; -/** @type {?} */ -Vector.prototype.get; -/** @type 
{?} */ -Vector.prototype.set; -/** @type {?} */ -Vector.prototype.toArray; -/** @type {?} */ -Vector.prototype.concat; -/** @type {?} */ -Vector.prototype.slice; -/** @type {?} */ -Vector.prototype.acceptTypeVisitor; - -var BaseInt64 = function() {}; -/** @type {?} */ -BaseInt64.prototype.lessThan; -/** @type {?} */ -BaseInt64.prototype.equals; -/** @type {?} */ -BaseInt64.prototype.greaterThan; -/** @type {?} */ -BaseInt64.prototype.hex; - -var Uint64 = function() {}; -/** @type {?} */ -Uint64.add = function() {}; -/** @type {?} */ -Uint64.multiply = function() {}; -/** @type {?} */ -Uint64.from = function() {}; -/** @type {?} */ -Uint64.fromNumber = function() {}; -/** @type {?} */ -Uint64.fromString = function() {}; -/** @type {?} */ -Uint64.prototype.times; -/** @type {?} */ -Uint64.prototype.plus - -var Int64 = function() {}; -/** @type {?} */ -Int64.add = function() {}; -/** @type {?} */ -Int64.multiply = function() {}; -/** @type {?} */ -Int64.from = function() {}; -/** @type {?} */ -Int64.fromNumber = function() {}; -/** @type {?} */ -Int64.fromString = function() {}; -/** @type {?} */ -Int64.prototype.negate -/** @type {?} */ -Int64.prototype.times -/** @type {?} */ -Int64.prototype.plus -/** @type {?} */ -Int64.prototype.lessThan - -var Int128 = function() {}; -/** @type {?} */ -Int128.add = function() {}; -/** @type {?} */ -Int128.multiply = function() {}; -/** @type {?} */ -Int128.from = function() {}; -/** @type {?} */ -Int128.fromNumber = function() {}; -/** @type {?} */ -Int128.fromString = function() {}; -/** @type {?} */ -Int128.prototype.negate -/** @type {?} */ -Int128.prototype.times -/** @type {?} */ -Int128.prototype.plus -/** @type {?} */ -Int128.prototype.hex - -var packBools = function() {}; - -var Type = function() {}; -/** @type {?} */ -Type.NONE = function() {}; -/** @type {?} */ -Type.Null = function() {}; -/** @type {?} */ -Type.Int = function() {}; -/** @type {?} */ -Type.Float = function() {}; -/** @type {?} */ -Type.FloatingPoint = function() {}; -/** @type {?} */ -Type.Binary = function() {}; -/** @type {?} */ -Type.Utf8 = function() {}; -/** @type {?} */ -Type.Bool = function() {}; -/** @type {?} */ -Type.Decimal = function() {}; -/** @type {?} */ -Type.Date = function() {}; -/** @type {?} */ -Type.Time = function() {}; -/** @type {?} */ -Type.Timestamp = function() {}; -/** @type {?} */ -Type.Interval = function() {}; -/** @type {?} */ -Type.List = function() {}; -/** @type {?} */ -Type.Struct = function() {}; -/** @type {?} */ -Type.Struct_ = function() {}; -/** @type {?} */ -Type.Union = function() {}; -/** @type {?} */ -Type.FixedSizeBinary = function() {}; -/** @type {?} */ -Type.FixedSizeList = function() {}; -/** @type {?} */ -Type.Map = function() {}; -/** @type {?} */ -Type.Dictionary = function() {}; -/** @type {?} */ -Type.DenseUnion = function() {}; -/** @type {?} */ -Type.SparseUnion = function() {}; - -var DateUnit = function() {}; -/** @type {?} */ -DateUnit.DAY = function() {}; -/** @type {?} */ -DateUnit.MILLISECOND = function() {}; -var TimeUnit = function() {}; -/** @type {?} */ -TimeUnit.SECOND = function() {}; -/** @type {?} */ -TimeUnit.MILLISECOND = function() {}; -/** @type {?} */ -TimeUnit.MICROSECOND = function() {}; -/** @type {?} */ -TimeUnit.NANOSECOND = function() {}; -var Precision = function() {}; -/** @type {?} */ -Precision.HALF = function() {}; -/** @type {?} */ -Precision.SINGLE = function() {}; -/** @type {?} */ -Precision.DOUBLE = function() {}; -var UnionMode = function() {}; -/** @type {?} */ -UnionMode.Sparse = 
function() {}; -/** @type {?} */ -UnionMode.Dense = function() {}; -var VectorType = function() {}; -/** @type {?} */ -VectorType.OFFSET = function() {}; -/** @type {?} */ -VectorType.DATA = function() {}; -/** @type {?} */ -VectorType.VALIDITY = function() {}; -/** @type {?} */ -VectorType.TYPE = function() {}; -var IntervalUnit = function() {}; -/** @type {?} */ -IntervalUnit.YEAR_MONTH = function() {}; -/** @type {?} */ -IntervalUnit.DAY_TIME = function() {}; -var MessageHeader = function() {}; -/** @type {?} */ -MessageHeader.NONE = function() {}; -/** @type {?} */ -MessageHeader.Schema = function() {}; -/** @type {?} */ -MessageHeader.DictionaryBatch = function() {}; -/** @type {?} */ -MessageHeader.RecordBatch = function() {}; -/** @type {?} */ -MessageHeader.Tensor = function() {}; -var MetadataVersion = function() {}; -/** @type {?} */ -MetadataVersion.V1 = function() {}; -/** @type {?} */ -MetadataVersion.V2 = function() {}; -/** @type {?} */ -MetadataVersion.V3 = function() {}; -/** @type {?} */ -MetadataVersion.V4 = function() {}; - -var DataType = function() {}; -/** @type {?} */ -DataType.isNull = function() {}; -/** @type {?} */ -DataType.isInt = function() {}; -/** @type {?} */ -DataType.isFloat = function() {}; -/** @type {?} */ -DataType.isBinary = function() {}; -/** @type {?} */ -DataType.isUtf8 = function() {}; -/** @type {?} */ -DataType.isBool = function() {}; -/** @type {?} */ -DataType.isDecimal = function() {}; -/** @type {?} */ -DataType.isDate = function() {}; -/** @type {?} */ -DataType.isTime = function() {}; -/** @type {?} */ -DataType.isTimestamp = function() {}; -/** @type {?} */ -DataType.isInterval = function() {}; -/** @type {?} */ -DataType.isList = function() {}; -/** @type {?} */ -DataType.isStruct = function() {}; -/** @type {?} */ -DataType.isUnion = function() {}; -/** @type {?} */ -DataType.isDenseUnion = function() {}; -/** @type {?} */ -DataType.isSparseUnion = function() {}; -/** @type {?} */ -DataType.isFixedSizeBinary = function() {}; -/** @type {?} */ -DataType.isFixedSizeList = function() {}; -/** @type {?} */ -DataType.isMap = function() {}; -/** @type {?} */ -DataType.isDictionary = function() {}; -/** @type {?} */ -DataType.prototype.ArrayType; - -var Schema = function() {}; -/** @type {?} */ -Schema.from = function() {}; -/** @type {?} */ -Schema.prototype.fields; -/** @type {?} */ -Schema.prototype.version; -/** @type {?} */ -Schema.prototype.metadata; -/** @type {?} */ -Schema.prototype.dictionaries; -/** @type {?} */ -Schema.prototype.select; -var Field = function() {}; -/** @type {?} */ -Field.prototype.name; -/** @type {?} */ -Field.prototype.type; -/** @type {?} */ -Field.prototype.nullable; -/** @type {?} */ -Field.prototype.metadata; -var Null = function() {}; -var Int8 = function() {}; -var Int16 = function() {}; -var Int32 = function() {}; -var Int64 = function() {}; -var Uint8 = function() {}; -var Uint16 = function() {}; -var Uint32 = function() {}; -var Uint64 = function() {}; -var Float16 = function() {}; -var Float32 = function() {}; -var Float64 = function() {}; -var Binary = function() {}; -var Utf8 = function() {}; -var Bool = function() {}; -var Decimal = function() {}; -var Date_ = function() {}; -var Time = function() {}; -var Timestamp = function() {}; -var Interval = function() {}; -var List = function() {}; -var Struct = function() {}; -var Union = function() {}; -var DenseUnion = function() {}; -var SparseUnion = function() {}; -var FixedSizeBinary = function() {}; -var FixedSizeList = function() {}; -var Map_ = 
function() {}; -var Dictionary = function() {}; - -var BaseData = function() {}; -/** @type {?} */ -BaseData.prototype.type; -/** @type {?} */ -BaseData.prototype.clone; -/** @type {?} */ -BaseData.prototype.slice; -/** @type {?} */ -BaseData.prototype.length; -/** @type {?} */ -BaseData.prototype.offset; -/** @type {?} */ -BaseData.prototype.typeId; -/** @type {?} */ -BaseData.prototype.childData; -/** @type {?} */ -BaseData.prototype.nullBitmap; -/** @type {?} */ -BaseData.prototype.nullCount; - -var BoolData = function() {}; -var NestedData = function() {}; -var SparseUnionData = function() {}; -var ChunkedData = function() {}; - -var FlatData = function() {}; -/** @type {?} */ -FlatData.prototype.values; - -var FlatListData = function() {}; -/** @type {?} */ -FlatListData.prototype.values; -/** @type {?} */ -FlatListData.prototype.valueOffsets; - -var DictionaryData = function() {}; -/** @type {?} */ -DictionaryData.prototype.indices; -/** @type {?} */ -DictionaryData.prototype.dictionary; - -var ListData = function() {}; -/** @type {?} */ -ListData.prototype.values; -/** @type {?} */ -ListData.prototype.valueOffsets; - -var UnionData = function() {}; -/** @type {?} */ -UnionData.prototype.typeIds; - -var DenseUnionData = function() {}; -/** @type {?} */ -DenseUnionData.prototype.valueOffsets; - -var ChunkedData = function() {}; -/** @type {?} */ -ChunkedData.computeOffsets = function() {}; - -var FlatVector = function() {}; -/** @type {?} */ -FlatVector.prototype.values; -/** @type {?} */ -FlatVector.prototype.lows; -/** @type {?} */ -FlatVector.prototype.highs; -/** @type {?} */ -FlatVector.prototype.asInt32; - -var ListVectorBase = function() {}; -/** @type {?} */ -ListVectorBase.prototype.values; -/** @type {?} */ -ListVectorBase.prototype.valueOffsets; -/** @type {?} */ -ListVectorBase.prototype.getValueOffset; -/** @type {?} */ -ListVectorBase.prototype.getValueLength; - -var NestedVector = function() {}; -/** @type {?} */ -NestedVector.prototype.childData; -/** @type {?} */ -NestedVector.prototype.getChildAt; - -var NullVector = function() {}; -var BoolVector = function() {}; -/** @type {?} */ -BoolVector.from = function() {}; -/** @type {?} */ -BoolVector.prototype.values; -var IntVector = function() {}; -/** @type {?} */ -IntVector.from = function() {}; - -var FloatVector = function() {}; -/** @type {?} */ -FloatVector.from = function() {}; - -var DateVector = function() {}; -/** @type {?} */ -DateVector.from = function() {}; -/** @type {?} */ -DateVector.prototype.asEpochMilliseconds; -var DecimalVector = function() {}; -var TimeVector = function() {}; -var TimestampVector = function() {}; -/** @type {?} */ -TimestampVector.prototype.asEpochMilliseconds; -var IntervalVector = function() {}; -var BinaryVector = function() {}; -/** @type {?} */ -BinaryVector.prototype.asUtf8; -var FixedSizeBinaryVector = function() {}; -var Utf8Vector = function() {}; -/** @type {?} */ -Utf8Vector.prototype.asBinary; -var ListVector = function() {}; -/** @type {?} */ -ListVector.prototype.getChildAt; -var FixedSizeListVector = function() {}; -/** @type {?} */ -FixedSizeListVector.prototype.getChildAt; -var MapVector = function() {}; -/** @type {?} */ -MapVector.prototype.asStruct; -var StructVector = function() {}; -/** @type {?} */ -StructVector.prototype.asMap; -var UnionVector = function() {}; - -var DictionaryVector = function() {}; -/** @type {?} */ -DictionaryVector.prototype.indices; -/** @type {?} */ -DictionaryVector.prototype.dictionary; -/** @type {?} */ 
-DictionaryVector.prototype.getKey; -/** @type {?} */ -DictionaryVector.prototype.getValue; -/** @type {?} */ -DictionaryVector.prototype.reverseLookup; - -var FlatView = function() {}; -/** @type {?} */ -FlatView.prototype.get; -/** @type {?} */ -FlatView.prototype.clone; -/** @type {?} */ -FlatView.prototype.isValid; -/** @type {?} */ -FlatView.prototype.toArray; -/** @type {?} */ -FlatView.prototype.set; - -var PrimitiveView = function() {}; -/** @type {?} */ -PrimitiveView.prototype.size; -/** @type {?} */ -PrimitiveView.prototype.clone; - -var NullView = function() {}; -/** @type {?} */ -NullView.prototype.get; -/** @type {?} */ -NullView.prototype.clone; -/** @type {?} */ -NullView.prototype.isValid; -/** @type {?} */ -NullView.prototype.toArray; -/** @type {?} */ -NullView.prototype.set; - -var BoolView = function() {}; -/** @type {?} */ -BoolView.prototype.get; -/** @type {?} */ -BoolView.prototype.clone; -/** @type {?} */ -BoolView.prototype.isValid; -/** @type {?} */ -BoolView.prototype.toArray; -/** @type {?} */ -BoolView.prototype.set; - -var ValidityView = function() {}; -/** @type {?} */ -ValidityView.prototype.get; -/** @type {?} */ -ValidityView.prototype.clone; -/** @type {?} */ -ValidityView.prototype.isValid; -/** @type {?} */ -ValidityView.prototype.toArray; -/** @type {?} */ -ValidityView.prototype.set; -/** @type {?} */ -ValidityView.prototype.size; -/** @type {?} */ -ValidityView.prototype.getChildAt; - -var DictionaryView = function() {}; -/** @type {?} */ -DictionaryView.prototype.get; -/** @type {?} */ -DictionaryView.prototype.clone; -/** @type {?} */ -DictionaryView.prototype.isValid; -/** @type {?} */ -DictionaryView.prototype.toArray; -/** @type {?} */ -DictionaryView.prototype.set; - -var ListViewBase = function() {}; -/** @type {?} */ -ListViewBase.prototype.get; -/** @type {?} */ -ListViewBase.prototype.clone; -/** @type {?} */ -ListViewBase.prototype.isValid; -/** @type {?} */ -ListViewBase.prototype.toArray; -/** @type {?} */ -ListViewBase.prototype.set; - -var NestedView = function() {}; -/** @type {?} */ -NestedView.prototype.get; -/** @type {?} */ -NestedView.prototype.clone; -/** @type {?} */ -NestedView.prototype.isValid; -/** @type {?} */ -NestedView.prototype.toArray; -/** @type {?} */ -NestedView.prototype.set; - -var ChunkedView = function() {}; -/** @type {?} */ -ChunkedView.prototype.get; -/** @type {?} */ -ChunkedView.prototype.clone; -/** @type {?} */ -ChunkedView.prototype.isValid; -/** @type {?} */ -ChunkedView.prototype.toArray; -/** @type {?} */ -ChunkedView.prototype.set; - -var ListView = function() {}; -var FixedSizeListView = function() {}; -var BinaryView = function() {}; -var Utf8View = function() {}; -var UnionView = function() {}; -var DenseUnionView = function() {}; -var StructView = function() {}; -var MapView = function() {}; -var NullView = function() {}; -var FixedSizeView = function() {}; -var Float16View = function() {}; -var DateDayView = function() {}; -var DateMillisecondView = function() {}; -var TimestampDayView = function() {}; -var TimestampSecondView = function() {}; -var TimestampMillisecondView = function() {}; -var TimestampMicrosecondView = function() {}; -var TimestampNanosecondView = function() {}; -var IntervalYearMonthView = function() {}; -var IntervalYearView = function() {}; -var IntervalMonthView = function() {}; - -var TypeVisitor = function() {}; -/** @type {?} */ -TypeVisitor.visitTypeInline = function() {}; -/** @type {?} */ -TypeVisitor.prototype.visit; -/** @type {?} */ 
-TypeVisitor.prototype.visitMany; -/** @type {?} */ -TypeVisitor.prototype.visitNull; -/** @type {?} */ -TypeVisitor.prototype.visitBool; -/** @type {?} */ -TypeVisitor.prototype.visitInt; -/** @type {?} */ -TypeVisitor.prototype.visitFloat; -/** @type {?} */ -TypeVisitor.prototype.visitUtf8; -/** @type {?} */ -TypeVisitor.prototype.visitBinary; -/** @type {?} */ -TypeVisitor.prototype.visitFixedSizeBinary; -/** @type {?} */ -TypeVisitor.prototype.visitDate; -/** @type {?} */ -TypeVisitor.prototype.visitTimestamp; -/** @type {?} */ -TypeVisitor.prototype.visitTime; -/** @type {?} */ -TypeVisitor.prototype.visitDecimal; -/** @type {?} */ -TypeVisitor.prototype.visitList; -/** @type {?} */ -TypeVisitor.prototype.visitStruct; -/** @type {?} */ -TypeVisitor.prototype.visitUnion; -/** @type {?} */ -TypeVisitor.prototype.visitDictionary; -/** @type {?} */ -TypeVisitor.prototype.visitInterval; -/** @type {?} */ -TypeVisitor.prototype.visitFixedSizeList; -/** @type {?} */ -TypeVisitor.prototype.visitMap; - -var VectorVisitor = function() {}; -/** @type {?} */ -VectorVisitor.visitTypeInline = function() {}; -/** @type {?} */ -VectorVisitor.prototype.visit; -/** @type {?} */ -VectorVisitor.prototype.visitMany; -/** @type {?} */ -VectorVisitor.prototype.visitNull; -/** @type {?} */ -VectorVisitor.prototype.visitBool; -/** @type {?} */ -VectorVisitor.prototype.visitInt; -/** @type {?} */ -VectorVisitor.prototype.visitFloat; -/** @type {?} */ -VectorVisitor.prototype.visitUtf8; -/** @type {?} */ -VectorVisitor.prototype.visitBinary; -/** @type {?} */ -VectorVisitor.prototype.visitFixedSizeBinary; -/** @type {?} */ -VectorVisitor.prototype.visitDate; -/** @type {?} */ -VectorVisitor.prototype.visitTimestamp; -/** @type {?} */ -VectorVisitor.prototype.visitTime; -/** @type {?} */ -VectorVisitor.prototype.visitDecimal; -/** @type {?} */ -VectorVisitor.prototype.visitList; -/** @type {?} */ -VectorVisitor.prototype.visitStruct; -/** @type {?} */ -VectorVisitor.prototype.visitUnion; -/** @type {?} */ -VectorVisitor.prototype.visitDictionary; -/** @type {?} */ -VectorVisitor.prototype.visitInterval; -/** @type {?} */ -VectorVisitor.prototype.visitFixedSizeList; -/** @type {?} */ -VectorVisitor.prototype.visitMap; diff --git a/js/src/Arrow.node.ts b/js/src/Arrow.node.ts new file mode 100644 index 0000000000000..da6e3df6d9b08 --- /dev/null +++ b/js/src/Arrow.node.ts @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
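Arrow.node.ts, added below, is the Node counterpart: it registers `toNodeStream` plus the `throughNode` transforms and then re-exports the DOM entry point wholesale. A sketch of the resulting stream pipeline, a parse-and-reserialize copy of an Arrow IPC stream with hypothetical file names:

import * as fs from 'fs';
import { RecordBatchReader, RecordBatchWriter } from 'apache-arrow';

fs.createReadStream('in.arrow')
    .pipe(RecordBatchReader.throughNode())   // parse incoming bytes into RecordBatches
    .pipe(RecordBatchWriter.throughNode())   // serialize RecordBatches back to bytes
    .pipe(fs.createWriteStream('out.arrow'));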
+ +import streamAdapters from './io/adapters'; +import { RecordBatchReader } from './ipc/reader'; +import { RecordBatchWriter } from './ipc/writer'; +import { toNodeStream } from './ipc/node/iterable'; +import { recordBatchReaderThroughNodeStream } from './ipc/node/reader'; +import { recordBatchWriterThroughNodeStream } from './ipc/node/writer'; + +streamAdapters.toNodeStream = toNodeStream; +RecordBatchReader['throughNode'] = recordBatchReaderThroughNodeStream; +RecordBatchWriter['throughNode'] = recordBatchWriterThroughNodeStream; + +export * from './Arrow.dom'; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index c76578b62996d..691a8bb42b73c 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,306 +15,79 @@ // specific language governing permissions and limitations // under the License. -import * as type_ from './type'; -import * as data_ from './data'; -import * as vector_ from './vector'; +export { ArrowType, DateUnit, IntervalUnit, MessageHeader, MetadataVersion, Precision, TimeUnit, Type, UnionMode, VectorType } from './enum'; +export { Data } from './data'; +export { + DataType, + Null, + Bool, + Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, + Float, Float16, Float32, Float64, + Utf8, + Binary, + FixedSizeBinary, + Date_, DateDay, DateMillisecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Decimal, + List, + Struct, + Union, DenseUnion, SparseUnion, + Dictionary, + Interval, IntervalDayTime, IntervalYearMonth, + FixedSizeList, + Map_, +} from './type'; + +export { Table } from './table'; +export { Column } from './column'; +export { Schema, Field } from './schema'; +export { Visitor } from './visitor'; +export { + Row, + Vector, + BaseVector, + BinaryVector, + BoolVector, + Chunked, + DateVector, DateDayVector, DateMillisecondVector, + DecimalVector, + DictionaryVector, + FixedSizeBinaryVector, + FixedSizeListVector, + FloatVector, Float16Vector, Float32Vector, Float64Vector, + IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector, + IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, + ListVector, + MapVector, + NullVector, + StructVector, + TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector, + TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector, + UnionVector, DenseUnionVector, SparseUnionVector, + Utf8Vector, +} from './vector/index'; + +export { ByteStream, AsyncByteStream, AsyncByteQueue, ReadableSource, WritableSink } from './io/stream'; +export { RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader } from './ipc/reader'; +export { RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter } from './ipc/writer'; +export { MessageReader, AsyncMessageReader, JSONMessageReader } from './ipc/message'; +export { Message } from './ipc/metadata/message'; +export { RecordBatch } from './recordbatch'; +export { ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions } from './io/interfaces'; +export { DataFrame, FilteredDataFrame, CountByResult, BindFunc, NextFunc } from './compute/dataframe'; + import * as util_int_ from './util/int'; import * as util_bit_ from './util/bit'; -import * as util_node from './util/node'; 
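The remainder of this hunk deletes the old namespace-based surface (`Arrow.type`, `Arrow.vector`, `Arrow.predicate`, `Arrow.util`, and friends) in favor of the flat named exports added above; only `predicate` and a merged `util` bag stay grouped. A minimal sketch of consuming code after the change, package import assumed:

import { IntVector, predicate, util } from 'apache-arrow';

// before: Arrow.vector.IntVector.from(...); after: a flat named import
const vec = IntVector.from(new Int32Array([1, 2, 3]));
const pred = predicate.col('n').gt(1);                  // predicate is still a grouped export
const big = util.Int64.fromString('9007199254740993');  // util merges the int/bit/buffer/vector helpers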
-import * as visitor_ from './visitor'; -import * as view_ from './vector/view'; -import * as predicate_ from './predicate'; -import { Vector } from './vector'; -import { RecordBatch } from './recordbatch'; -import { Schema, Field, Type } from './type'; -import { Table, DataFrame, NextFunc, BindFunc, CountByResult } from './table'; -import { fromReadableStream } from './ipc/reader/node'; -import { read, readAsync, readStream } from './ipc/reader/arrow'; -import { readBuffersAsync, readRecordBatchesAsync } from './ipc/reader/arrow'; -import { serializeFile, serializeStream } from './ipc/writer/binary'; - -export import View = vector_.View; -export import VectorLike = vector_.VectorLike; -export import TypedArray = type_.TypedArray; -export import IntBitWidth = type_.IntBitWidth; -export import TimeBitWidth = type_.TimeBitWidth; -export import TypedArrayConstructor = type_.TypedArrayConstructor; - -export { fromReadableStream }; -export { read, readAsync, readStream }; -export { readBuffersAsync, readRecordBatchesAsync }; -export { serializeFile, serializeStream }; -export { Table, DataFrame, NextFunc, BindFunc, CountByResult }; -export { Field, Schema, RecordBatch, Vector, Type }; - -export namespace util { - export import Uint64 = util_int_.Uint64; - export import Int64 = util_int_.Int64; - export import Int128 = util_int_.Int128; - export import packBools = util_bit_.packBools; - export import PipeIterator = util_node.PipeIterator; - export import AsyncPipeIterator = util_node.AsyncPipeIterator; -} - -export namespace data { - export import BaseData = data_.BaseData; - export import FlatData = data_.FlatData; - export import BoolData = data_.BoolData; - export import FlatListData = data_.FlatListData; - export import DictionaryData = data_.DictionaryData; - export import NestedData = data_.NestedData; - export import ListData = data_.ListData; - export import UnionData = data_.UnionData; - export import SparseUnionData = data_.SparseUnionData; - export import DenseUnionData = data_.DenseUnionData; - export import ChunkedData = data_.ChunkedData; -} - -export namespace enum_ { - export import Type = type_.ArrowType; - export import DateUnit = type_.DateUnit; - export import TimeUnit = type_.TimeUnit; - export import Precision = type_.Precision; - export import UnionMode = type_.UnionMode; - export import VectorType = type_.VectorType; - export import IntervalUnit = type_.IntervalUnit; - export import MessageHeader = type_.MessageHeader; - export import MetadataVersion = type_.MetadataVersion; -} - -export namespace type { - export import Schema = type_.Schema; - export import Field = type_.Field; - export import Null = type_.Null; - export import Int = type_.Int; - export import Int8 = type_.Int8; - export import Int16 = type_.Int16; - export import Int32 = type_.Int32; - export import Int64 = type_.Int64; - export import Uint8 = type_.Uint8; - export import Uint16 = type_.Uint16; - export import Uint32 = type_.Uint32; - export import Uint64 = type_.Uint64; - export import Float = type_.Float; - export import Float16 = type_.Float16; - export import Float32 = type_.Float32; - export import Float64 = type_.Float64; - export import Binary = type_.Binary; - export import Utf8 = type_.Utf8; - export import Bool = type_.Bool; - export import Decimal = type_.Decimal; - export import Date_ = type_.Date_; - export import Time = type_.Time; - export import Timestamp = type_.Timestamp; - export import Interval = type_.Interval; - export import List = type_.List; - export import Struct = 
type_.Struct; - export import Union = type_.Union; - export import DenseUnion = type_.DenseUnion; - export import SparseUnion = type_.SparseUnion; - export import FixedSizeBinary = type_.FixedSizeBinary; - export import FixedSizeList = type_.FixedSizeList; - export import Map_ = type_.Map_; - export import Dictionary = type_.Dictionary; -} - -export namespace vector { - export import Vector = vector_.Vector; - export import NullVector = vector_.NullVector; - export import BoolVector = vector_.BoolVector; - export import IntVector = vector_.IntVector; - export import FloatVector = vector_.FloatVector; - export import DateVector = vector_.DateVector; - export import DecimalVector = vector_.DecimalVector; - export import TimeVector = vector_.TimeVector; - export import TimestampVector = vector_.TimestampVector; - export import IntervalVector = vector_.IntervalVector; - export import BinaryVector = vector_.BinaryVector; - export import FixedSizeBinaryVector = vector_.FixedSizeBinaryVector; - export import Utf8Vector = vector_.Utf8Vector; - export import ListVector = vector_.ListVector; - export import FixedSizeListVector = vector_.FixedSizeListVector; - export import MapVector = vector_.MapVector; - export import StructVector = vector_.StructVector; - export import UnionVector = vector_.UnionVector; - export import DictionaryVector = vector_.DictionaryVector; -} - -export namespace visitor { - export import TypeVisitor = visitor_.TypeVisitor; - export import VectorVisitor = visitor_.VectorVisitor; -} - -export namespace view { - export import ChunkedView = view_.ChunkedView; - export import DictionaryView = view_.DictionaryView; - export import ListView = view_.ListView; - export import FixedSizeListView = view_.FixedSizeListView; - export import BinaryView = view_.BinaryView; - export import Utf8View = view_.Utf8View; - export import UnionView = view_.UnionView; - export import DenseUnionView = view_.DenseUnionView; - export import NestedView = view_.NestedView; - export import StructView = view_.StructView; - export import MapView = view_.MapView; - export import FlatView = view_.FlatView; - export import NullView = view_.NullView; - export import BoolView = view_.BoolView; - export import ValidityView = view_.ValidityView; - export import PrimitiveView = view_.PrimitiveView; - export import FixedSizeView = view_.FixedSizeView; - export import Float16View = view_.Float16View; - export import DateDayView = view_.DateDayView; - export import DateMillisecondView = view_.DateMillisecondView; - export import TimestampDayView = view_.TimestampDayView; - export import TimestampSecondView = view_.TimestampSecondView; - export import TimestampMillisecondView = view_.TimestampMillisecondView; - export import TimestampMicrosecondView = view_.TimestampMicrosecondView; - export import TimestampNanosecondView = view_.TimestampNanosecondView; - export import IntervalYearMonthView = view_.IntervalYearMonthView; - export import IntervalYearView = view_.IntervalYearView; - export import IntervalMonthView = view_.IntervalMonthView; -} - -export namespace predicate { - export import col = predicate_.col; - export import lit = predicate_.lit; - export import and = predicate_.and; - export import or = predicate_.or; - export import custom = predicate_.custom; - - export import Or = predicate_.Or; - export import Col = predicate_.Col; - export import And = predicate_.And; - export import Not = predicate_.Not; - export import GTeq = predicate_.GTeq; - export import LTeq = predicate_.LTeq; - export import Value = 
predicate_.Value; - export import Equals = predicate_.Equals; - export import Literal = predicate_.Literal; - export import Predicate = predicate_.Predicate; - - export import PredicateFunc = predicate_.PredicateFunc; -} - -/* These exports are needed for the closure and uglify umd targets */ -try { - let Arrow: any = eval('exports'); - if (Arrow && typeof Arrow === 'object') { - // string indexers tell closure and uglify not to rename these properties - Arrow['data'] = data; - Arrow['type'] = type; - Arrow['util'] = util; - Arrow['view'] = view; - Arrow['enum_'] = enum_; - Arrow['vector'] = vector; - Arrow['visitor'] = visitor; - Arrow['predicate'] = predicate; - - Arrow['read'] = read; - Arrow['readAsync'] = readAsync; - Arrow['readStream'] = readStream; - Arrow['fromReadableStream'] = fromReadableStream; - Arrow['readBuffersAsync'] = readBuffersAsync; - Arrow['readRecordBatchesAsync'] = readRecordBatchesAsync; - - Arrow['serializeFile'] = serializeFile; - Arrow['serializeStream'] = serializeStream; - - Arrow['Type'] = Type; - Arrow['Field'] = Field; - Arrow['Schema'] = Schema; - Arrow['Vector'] = Vector; - Arrow['RecordBatch'] = RecordBatch; - - Arrow['Table'] = Table; - Arrow['CountByResult'] = CountByResult; - } -} catch (e) { /* not the UMD bundle */ } -/* end umd exports */ - -// closure compiler erases static properties/methods: -// https://github.com/google/closure-compiler/issues/1776 -// set them via string indexers to save them from the mangler -Schema['from'] = Schema.from; -Table['from'] = Table.from; -Table['fromVectors'] = Table.fromVectors; -Table['fromAsync'] = Table.fromAsync; -Table['fromStruct'] = Table.fromStruct; -Table['empty'] = Table.empty; -Vector['create'] = Vector.create; -RecordBatch['from'] = RecordBatch.from; - -util_int_.Uint64['add'] = util_int_.Uint64.add; -util_int_.Uint64['multiply'] = util_int_.Uint64.multiply; -util_int_.Uint64['from'] = util_int_.Uint64.from; -util_int_.Uint64['fromNumber'] = util_int_.Uint64.fromNumber; -util_int_.Uint64['fromString'] = util_int_.Uint64.fromString; -util_int_.Uint64['convertArray'] = util_int_.Uint64.convertArray; - -util_int_.Int64['add'] = util_int_.Int64.add; -util_int_.Int64['multiply'] = util_int_.Int64.multiply; -util_int_.Int64['from'] = util_int_.Int64.from; -util_int_.Int64['fromNumber'] = util_int_.Int64.fromNumber; -util_int_.Int64['fromString'] = util_int_.Int64.fromString; -util_int_.Int64['convertArray'] = util_int_.Int64.convertArray; - -util_int_.Int128['add'] = util_int_.Int128.add; -util_int_.Int128['multiply'] = util_int_.Int128.multiply; -util_int_.Int128['from'] = util_int_.Int128.from; -util_int_.Int128['fromNumber'] = util_int_.Int128.fromNumber; -util_int_.Int128['fromString'] = util_int_.Int128.fromString; -util_int_.Int128['convertArray'] = util_int_.Int128.convertArray; - -data_.ChunkedData['computeOffsets'] = data_.ChunkedData.computeOffsets; - -(type_.Type as any)['NONE'] = type_.Type.NONE; -(type_.Type as any)['Null'] = type_.Type.Null; -(type_.Type as any)['Int'] = type_.Type.Int; -(type_.Type as any)['Float'] = type_.Type.Float; -(type_.Type as any)['Binary'] = type_.Type.Binary; -(type_.Type as any)['Utf8'] = type_.Type.Utf8; -(type_.Type as any)['Bool'] = type_.Type.Bool; -(type_.Type as any)['Decimal'] = type_.Type.Decimal; -(type_.Type as any)['Date'] = type_.Type.Date; -(type_.Type as any)['Time'] = type_.Type.Time; -(type_.Type as any)['Timestamp'] = type_.Type.Timestamp; -(type_.Type as any)['Interval'] = type_.Type.Interval; -(type_.Type as any)['List'] = type_.Type.List; 
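The string-indexer assignments being removed here existed to defeat property mangling: Closure and Uglify rename dotted member accesses but leave quoted string keys alone, so pinning each static as `Klass['name'] = Klass.name` preserved the public API in the UMD bundles. A tiny illustration of the now-deleted pattern:

class Example {
    constructor(public values: number[]) {}
    static from(values: number[]) { return new Example(values); }
}

// A minifier may rename the dotted `Example.from`, but never the quoted 'from' key,
// so this self-assignment keeps the public name alive in the mangled bundle.
(Example as any)['from'] = Example.from;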
-(type_.Type as any)['Struct'] = type_.Type.Struct; -(type_.Type as any)['Union'] = type_.Type.Union; -(type_.Type as any)['FixedSizeBinary'] = type_.Type.FixedSizeBinary; -(type_.Type as any)['FixedSizeList'] = type_.Type.FixedSizeList; -(type_.Type as any)['Map'] = type_.Type.Map; -(type_.Type as any)['Dictionary'] = type_.Type.Dictionary; -(type_.Type as any)['DenseUnion'] = type_.Type.DenseUnion; -(type_.Type as any)['SparseUnion'] = type_.Type.SparseUnion; - -type_.DataType['isNull'] = type_.DataType.isNull; -type_.DataType['isInt'] = type_.DataType.isInt; -type_.DataType['isFloat'] = type_.DataType.isFloat; -type_.DataType['isBinary'] = type_.DataType.isBinary; -type_.DataType['isUtf8'] = type_.DataType.isUtf8; -type_.DataType['isBool'] = type_.DataType.isBool; -type_.DataType['isDecimal'] = type_.DataType.isDecimal; -type_.DataType['isDate'] = type_.DataType.isDate; -type_.DataType['isTime'] = type_.DataType.isTime; -type_.DataType['isTimestamp'] = type_.DataType.isTimestamp; -type_.DataType['isInterval'] = type_.DataType.isInterval; -type_.DataType['isList'] = type_.DataType.isList; -type_.DataType['isStruct'] = type_.DataType.isStruct; -type_.DataType['isUnion'] = type_.DataType.isUnion; -type_.DataType['isDenseUnion'] = type_.DataType.isDenseUnion; -type_.DataType['isSparseUnion'] = type_.DataType.isSparseUnion; -type_.DataType['isFixedSizeBinary'] = type_.DataType.isFixedSizeBinary; -type_.DataType['isFixedSizeList'] = type_.DataType.isFixedSizeList; -type_.DataType['isMap'] = type_.DataType.isMap; -type_.DataType['isDictionary'] = type_.DataType.isDictionary; - -vector_.BoolVector['from'] = vector_.BoolVector.from; -vector_.DateVector['from'] = vector_.DateVector.from; -vector_.IntVector['from'] = vector_.IntVector.from; -vector_.FloatVector['from'] = vector_.FloatVector.from; - -visitor_.TypeVisitor['visitTypeInline'] = visitor_.TypeVisitor.visitTypeInline; -visitor_.VectorVisitor['visitTypeInline'] = visitor_.VectorVisitor.visitTypeInline; \ No newline at end of file +import * as util_buffer_ from './util/buffer'; +import * as util_vector_ from './util/vector'; +import * as predicate from './compute/predicate'; + +export { predicate }; +/** @ignore */ +export const util = { + ...util_int_, + ...util_bit_, + ...util_buffer_, + ...util_vector_ +}; diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 510f00740fed0..4ae9c0089a009 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -20,60 +20,189 @@ /* tslint:disable */ import * as fs from 'fs'; -import { promisify } from 'util'; -import { Table, readStream } from '../Arrow'; +import * as stream from 'stream'; +import { valueToString } from '../util/pretty'; +import { RecordBatch, RecordBatchReader, AsyncByteQueue } from '../Arrow.node'; -const readFile = promisify(fs.readFile); -const { parse } = require('json-bignum'); +const padLeft = require('pad-left'); +const bignumJSONParse = require('json-bignum').parse; +const pipeline = require('util').promisify(stream.pipeline); const argv = require(`command-line-args`)(cliOpts(), { partial: true }); -const files = [...(argv.file || []), ...(argv._unknown || [])].filter(Boolean); +const files = argv.help ? 
[] : [...(argv.file || []), ...(argv._unknown || [])].filter(Boolean); + +const state = { ...argv, closed: false, hasRecords: false }; (async () => { - let hasRecords = false; - if (files.length > 0) { - hasRecords = true; - for (let input of files) { - printTable(await readFile(input)); - } - } else { - let rowOffset = 0; - let maxColumnWidths: number[] = []; - for await (const recordBatch of readStream(process.stdin)) { - hasRecords = true; - recordBatch.rowsToString(' | ', rowOffset, maxColumnWidths).pipe(process.stdout); - rowOffset += recordBatch.length; + + const sources = argv.help ? [] : [ + ...files.map((file) => () => fs.createReadStream(file)), + ...(process.stdin.isTTY ? [] : [() => process.stdin]) + ].filter(Boolean) as (() => NodeJS.ReadableStream)[]; + + let reader: RecordBatchReader | null; + + for (const source of sources) { + if (state.closed) { break; } + if (reader = await createRecordBatchReader(source)) { + await pipeline( + reader.toNodeStream(), + recordBatchRowsToString(state), + process.stdout + ).catch(() => state.closed = true); } + if (state.closed) { break; } } - return hasRecords ? null : print_usage(); -})().catch((e) => { console.error(e); process.exit(1); }); -function printTable(input: any) { - let table: Table; + return state.hasRecords ? 0 : print_usage(); +})() +.then((x) => +x || 0, (err) => { + if (err) { + console.error(`${err && err.stack || err}`); + } + return process.exitCode || 1; +}).then((code) => process.exit(code)); + +async function createRecordBatchReader(createSourceStream: () => NodeJS.ReadableStream) { + + let json = new AsyncByteQueue(); + let stream = new AsyncByteQueue(); + let source = createSourceStream(); + let reader: RecordBatchReader | null = null; + // tee the input source, just in case it's JSON + source.on('end', () => [stream, json].forEach((y) => y.close())) + .on('data', (x) => [stream, json].forEach((y) => y.write(x))) + .on('error', (e) => [stream, json].forEach((y) => y.abort(e))); + try { - table = Table.from(input); - } catch (e) { - table = Table.from(parse(input + '')); + reader = await (await RecordBatchReader.from(stream)).open(); + } catch (e) { reader = null; } + + if (!reader || reader.closed) { + reader = null; + await json.closed; + if (source instanceof fs.ReadStream) { source.close(); } + // If the data in the `json` ByteQueue parses to JSON, then assume it's Arrow JSON from a file or stdin + try { + reader = await (await RecordBatchReader.from(bignumJSONParse(await json.toString()))).open(); + } catch (e) { reader = null; } + } + + return (reader && !reader.closed) ? reader : null; +} + +function recordBatchRowsToString(state: { closed: boolean, schema: any, separator: string, hasRecords: boolean }) { + + let rowId = 0, maxColWidths = [15], separator = `${state.separator || ' |'} `; + + return new stream.Transform({ transform, encoding: 'utf8', writableObjectMode: true, readableObjectMode: false }); + + function transform(this: stream.Transform, batch: RecordBatch, _enc: string, cb: (error?: Error, data?: any) => void) { + batch = !(state.schema && state.schema.length) ? 
batch : batch.select(...state.schema); + if (batch.length <= 0 || batch.numCols <= 0 || state.closed) { + state.hasRecords || (state.hasRecords = false); + return cb(undefined, null); + } + + state.hasRecords = true; + const header = ['row_id', ...batch.schema.fields.map((f) => `${f}`)].map(valueToString); + + // Pass one to convert to strings and count max column widths + const newMaxWidths = measureColumnWidths(rowId, batch, header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length))); + + // If any of the column widths changed, print the header again + if ((rowId % 350) && JSON.stringify(newMaxWidths) !== JSON.stringify(maxColWidths)) { + this.push(`\n${formatRow(header, newMaxWidths, separator)}`); + } + + maxColWidths = newMaxWidths; + + for (const row of batch) { + if (state.closed) { break; } + else if (!row) { continue; } + if (!(rowId % 350)) { this.push(`\n${formatRow(header, maxColWidths, separator)}`); } + this.push(formatRow([rowId++, ...row].map(valueToString), maxColWidths, separator)); + } + cb(); } - if (argv.schema && argv.schema.length) { - table = table.select(...argv.schema); +} + +function formatRow(row: string[] = [], maxColWidths: number[] = [], separator: string = ' |') { + return row.map((x, j) => padLeft(x, maxColWidths[j])).join(separator) + '\n'; +} + +function measureColumnWidths(rowId: number, batch: RecordBatch, maxColWidths: number[] = []) { + for (const row of batch) { + if (!row) { continue; } + maxColWidths[0] = Math.max(maxColWidths[0] || 0, (`${rowId++}`).length); + for (let val: any, j = -1, k = row.length; ++j < k;) { + if (ArrayBuffer.isView(val = row[j]) && (typeof val[Symbol.toPrimitive] !== 'function')) { + // If we're printing a column of TypedArrays, ensure the column is wide enough to accommodate + // the widest possible element for a given byte size, since JS omits leading zeroes. 
For example: + // 1 | [1137743649,2170567488,244696391,2122556476] + // 2 | null + // 3 | [637174007,2142281880,961736230,2912449282] + // 4 | [1035112265,21832886,412842672,2207710517] + // 5 | null + // 6 | null + // 7 | [2755142991,4192423256,2994359,467878370] + const elementWidth = typedArrayElementWidths.get(val.constructor)!; + + maxColWidths[j + 1] = Math.max(maxColWidths[j + 1] || 0, + 2 + // brackets on each end + (val.length - 1) + // commas between elements + (val.length * elementWidth) // width of stringified 2^N-1 + ); + } else { + maxColWidths[j + 1] = Math.max(maxColWidths[j + 1] || 0, valueToString(val).length); + } + } } - table.rowsToString().pipe(process.stdout); + return maxColWidths; } +// Measure the stringified representation of 2^N-1 for each TypedArray variant +const typedArrayElementWidths = (() => { + const maxElementWidth = (ArrayType: any) => { + const octets = Array.from({ length: ArrayType.BYTES_PER_ELEMENT - 1 }, _ => 255); + return `${new ArrayType(new Uint8Array([...octets, 254]).buffer)[0]}`.length; + }; + return new Map([ + [Int8Array, maxElementWidth(Int8Array)], + [Int16Array, maxElementWidth(Int16Array)], + [Int32Array, maxElementWidth(Int32Array)], + [Uint8Array, maxElementWidth(Uint8Array)], + [Uint16Array, maxElementWidth(Uint16Array)], + [Uint32Array, maxElementWidth(Uint32Array)], + [Float32Array, maxElementWidth(Float32Array)], + [Float64Array, maxElementWidth(Float64Array)], + [Uint8ClampedArray, maxElementWidth(Uint8ClampedArray)] + ]) +})(); + function cliOpts() { return [ { type: String, name: 'schema', alias: 's', optional: true, multiple: true, - typeLabel: '[underline]{columns}', + typeLabel: '{underline columns}', description: 'A space-delimited list of column names' }, { type: String, name: 'file', alias: 'f', - optional: false, multiple: true, + optional: true, multiple: true, description: 'The Arrow file to read' + }, + { + type: String, + name: 'sep', optional: true, default: '|', + description: 'The column separator character' + }, + { + type: Boolean, + name: 'help', optional: true, default: false, + description: 'Print this usage guide.' } ]; } @@ -87,34 +216,29 @@ function print_usage() { { header: 'Synopsis', content: [ - '$ arrow2csv [underline]{file.arrow} [[bold]{--schema} column_name ...]', - '$ arrow2csv [[bold]{--schema} column_name ...] [[bold]{--file} [underline]{file.arrow}]', - '$ arrow2csv [bold]{-s} column_1 [bold]{-s} column_2 [[bold]{-f} [underline]{file.arrow}]', - '$ arrow2csv [[bold]{--help}]' + '$ arrow2csv {underline file.arrow} [{bold --schema} column_name ...]', + '$ arrow2csv [{bold --schema} column_name ...] [{bold --file} {underline file.arrow}]', + '$ arrow2csv {bold -s} column_1 {bold -s} column_2 [{bold -f} {underline file.arrow}]', + '$ arrow2csv [{bold --help}]' ] }, { header: 'Options', - optionList: [ - ...cliOpts(), - { - name: 'help', - description: 'Print this usage guide.' 
-        }
-      ]
+      optionList: cliOpts()
     },
     {
       header: 'Example',
       content: [
-        '$ arrow2csv --schema foo baz -f simple.arrow',
-        '> foo, baz',
-        '> 1, aa',
-        '> null, null',
-        '> 3, null',
-        '> 4, bbb',
-        '> 5, cccc',
+        '$ arrow2csv --schema foo baz -f simple.arrow --sep ","',
+        ' ',
+        '> "row_id", "foo: Int32", "bar: Float64", "baz: Utf8"',
+        '> 0, 1, 1, "aa"',
+        '> 1, null, null, null',
+        '> 2, 3, null, null',
+        '> 3, 4, 4, "bbb"',
+        '> 4, 5, 5, "cccc"',
       ]
     }
   ]));
-  process.exit(1);
-}
\ No newline at end of file
+  return 1;
+}
diff --git a/js/src/column.ts b/js/src/column.ts
new file mode 100644
index 0000000000000..0a5bc36797bf9
--- /dev/null
+++ b/js/src/column.ts
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Field } from './schema';
+import { Vector } from './vector';
+import { DataType } from './type';
+import { Clonable, Sliceable, Applicative } from './vector';
+import { Chunked, SearchContinuation } from './vector/chunked';
+
+export interface Column<T extends DataType = any> {
+    typeId: T['TType'];
+    concat(...others: Vector<T>[]): Column<T>;
+    slice(begin?: number, end?: number): Column<T>;
+    clone(chunks?: Vector<T>[], offsets?: Uint32Array): Column<T>;
+}
+
+export class Column<T extends DataType = any>
+    extends Chunked<T>
+    implements Clonable<Column<T>>,
+               Sliceable<Column<T>>,
+               Applicative<T, Column<T>> {
+
+    constructor(field: Field<T>, vectors: Vector<T>[] = [], offsets?: Uint32Array) {
+        vectors = Chunked.flatten(...vectors);
+        super(field.type, vectors, offsets);
+        this._field = field;
+        if (vectors.length === 1 && !(this instanceof SingleChunkColumn)) {
+            return new SingleChunkColumn(field, vectors[0], this._chunkOffsets);
+        }
+    }
+
+    protected _field: Field<T>;
+    protected _children?: Column[];
+
+    public get field() { return this._field; }
+    public get name() { return this._field.name; }
+
+    public clone(chunks = this._chunks) {
+        return new Column(this._field, chunks);
+    }
+
+    public getChildAt<R extends DataType = any>(index: number): Column<R> | null {
+
+        if (index < 0 || index >= this.numChildren) { return null; }
+
+        let columns = this._children || (this._children = []);
+        let column: Column<R>, field: Field<R>, chunks: Vector<R>[];
+
+        if (column = columns[index]) { return column; }
+        if (field = ((this.type.children || [])[index] as Field<R>)) {
+            chunks = this._chunks
+                .map((vector) => vector.getChildAt<R>(index))
+                .filter((vec): vec is Vector<R> => vec != null);
+            if (chunks.length > 0) {
+                return (columns[index] = new Column<R>(field, chunks));
+            }
+        }
+
+        return null;
+    }
+}
+
+class SingleChunkColumn<T extends DataType = any> extends Column<T> {
+    protected _chunk: Vector<T>;
+    constructor(field: Field<T>, vector: Vector<T>, offsets?: Uint32Array) {
+        super(field, [vector], offsets);
+        this._chunk = vector;
+    }
+    public search(index: number): [number, number] | null;
+    public search<N extends SearchContinuation<Chunked<T>>>(index: number, then?: N): ReturnType<N>;
+    public search<N extends SearchContinuation<Chunked<T>>>(index: number, then?: N) {
+        return then ? then(this, 0, index) : [0, index];
+    }
+    public isValid(index: number): boolean {
+        return this._chunk.isValid(index);
+    }
+    public get(index: number): T['TValue'] | null {
+        return this._chunk.get(index);
+    }
+    public set(index: number, value: T['TValue'] | null): void {
+        this._chunk.set(index, value);
+    }
+    public indexOf(element: T['TValue'], offset?: number): number {
+        return this._chunk.indexOf(element, offset);
+    }
+}
diff --git a/js/src/compute/dataframe.ts b/js/src/compute/dataframe.ts
new file mode 100644
index 0000000000000..01026d882f0c0
--- /dev/null
+++ b/js/src/compute/dataframe.ts
@@ -0,0 +1,209 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Table } from '../table';
+import { Vector } from '../vector';
+import { IntVector } from '../vector/int';
+import { Field, Schema } from '../schema';
+import { Vector as V } from '../interfaces';
+import { Predicate, Col } from './predicate';
+import { RecordBatch } from '../recordbatch';
+import { DataType, Int, Struct, Dictionary } from '../type';
+
+/** @ignore */
+export type BindFunc = (batch: RecordBatch) => void;
+/** @ignore */
+export type NextFunc = (idx: number, batch: RecordBatch) => void;
+
+Table.prototype.countBy = function(this: Table, name: Col | string) { return new DataFrame(this.chunks).countBy(name); };
+Table.prototype.scan = function(this: Table, next: NextFunc, bind?: BindFunc) { return new DataFrame(this.chunks).scan(next, bind); };
+Table.prototype.filter = function(this: Table, predicate: Predicate): FilteredDataFrame { return new DataFrame(this.chunks).filter(predicate); };
+
+export class DataFrame<T extends { [key: string]: DataType } = any> extends Table<T> {
+    public filter(predicate: Predicate): FilteredDataFrame<T> {
+        return new FilteredDataFrame<T>(this.chunks, predicate);
+    }
+    public scan(next: NextFunc, bind?: BindFunc) {
+        const batches = this.chunks, numBatches = batches.length;
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // load batches
+            const batch = batches[batchIndex];
+            if (bind) { bind(batch); }
+            // yield all indices
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                next(index, batch);
+            }
+        }
+    }
+    public countBy(name: Col | string) {
+        const batches = this.chunks, numBatches = batches.length;
+        const count_by = typeof name === 'string' ?
new Col(name) : name as Col; + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as V; + if (!DataType.isDictionary(vector.type)) { + throw new Error('countBy currently only supports dictionary-encoded columns'); + } + + const countByteLength = Math.ceil(Math.log(vector.dictionary.length) / Math.log(256)); + const CountsArrayType = countByteLength == 4 ? Uint32Array : + countByteLength >= 2 ? Uint16Array : Uint8Array; + + const counts = new CountsArrayType(vector.dictionary.length); + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as V).indices; + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + let key = keys.get(index); + if (key !== null) { counts[key]++; } + } + } + return new CountByResult(vector.dictionary, IntVector.from(counts)); + } +} + +export class CountByResult extends Table<{ values: T, counts: TCount }> { + constructor(values: Vector, counts: V) { + const schema = new Schema<{ values: T, counts: TCount }>([ + new Field('values', values.type), + new Field('counts', counts.type) + ]); + super(new RecordBatch(schema, counts.length, [values, counts])); + } + public toJSON(): Object { + const values = this.getColumnAt(0)!; + const counts = this.getColumnAt(1)!; + const result = {} as { [k: string]: number | null }; + for (let i = -1; ++i < this.length;) { + result[values.get(i)] = counts.get(i); + } + return result; + } +} + +export class FilteredDataFrame extends DataFrame { + private _predicate: Predicate; + constructor (batches: RecordBatch[], predicate: Predicate) { + super(batches); + this._predicate = predicate; + } + public scan(next: NextFunc, bind?: BindFunc) { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + // TODO: bind batches lazily + // If predicate doesn't match anything in the batch we don't need + // to bind the callback + if (bind) { bind(batch); } + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { next(index, batch); } + } + } + } + public count(): number { + // inlined version of this: + // let sum = 0; + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) ++sum; + // }); + // return sum; + let sum = 0; + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { ++sum; } + } + } + return sum; + } + public *[Symbol.iterator](): IterableIterator['TValue']> { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) 
{ + // load batches + const batch = batches[batchIndex]; + // TODO: bind batches lazily + // If predicate doesn't match anything in the batch we don't need + // to bind the callback + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { yield batch.get(index) as any; } + } + } + } + public filter(predicate: Predicate): FilteredDataFrame { + return new FilteredDataFrame( + this._chunks, + this._predicate.and(predicate) + ); + } + public countBy(name: Col | string) { + const batches = this._chunks, numBatches = batches.length; + const count_by = typeof name === 'string' ? new Col(name) : name as Col; + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as V; + if (!DataType.isDictionary(vector.type)) { + throw new Error('countBy currently only supports dictionary-encoded columns'); + } + + const countByteLength = Math.ceil(Math.log(vector.dictionary.length) / Math.log(256)); + const CountsArrayType = countByteLength == 4 ? Uint32Array : + countByteLength >= 2 ? Uint16Array : Uint8Array; + + const counts = new CountsArrayType(vector.dictionary.length); + + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this._predicate.bind(batch); + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as V).indices; + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + let key = keys.get(index); + if (key !== null && predicate(index, batch)) { counts[key]++; } + } + } + return new CountByResult(vector.dictionary, IntVector.from(counts)); + } +} diff --git a/js/src/predicate.ts b/js/src/compute/predicate.ts similarity index 94% rename from js/src/predicate.ts rename to js/src/compute/predicate.ts index cfae73ae0af73..ec947d2670c81 100644 --- a/js/src/predicate.ts +++ b/js/src/compute/predicate.ts @@ -15,12 +15,16 @@ // specific language governing permissions and limitations // under the License. 
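
A minimal sketch of how the compute pieces fit together (an editor's illustration, not part of the patch: the `flights` table and its column names are hypothetical, and the import paths are written as if the snippet lived in js/src/compute, while Col, eq/ge/and, filter, count, and countBy are the APIs defined in compute/predicate.ts and compute/dataframe.ts):

    import { Table } from '../table';
    import { Col } from './predicate';
    import '../compute/dataframe'; // installs Table.prototype.filter/countBy/scan

    declare const flights: Table; // assumed to exist, e.g. from Table.from(...)

    // Predicates compose with and/or/not, and are bound once per RecordBatch
    const delayed = flights.filter(
        new Col('origin').eq('SEA').and(new Col('delay').ge(30)));

    delayed.count();            // number of rows matching both predicates
    delayed.countBy('carrier'); // requires a dictionary-encoded column
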
-import { RecordBatch } from './recordbatch';
-import { Vector, DictionaryVector } from './vector';
+import { Vector } from '../vector';
+import { RecordBatch } from '../recordbatch';
+import { DictionaryVector } from '../vector/dictionary';
 
+/** @ignore */
 export type ValueFunc<T> = (idx: number, cols: RecordBatch) => T | null;
+/** @ignore */
 export type PredicateFunc = (idx: number, cols: RecordBatch) => boolean;
 
+/** @ignore */
 export abstract class Value<T> {
     eq(other: Value<T> | T): Predicate {
         if (!(other instanceof Value)) { other = new Literal(other); }
@@ -45,10 +49,12 @@ export abstract class Value<T> {
     }
 }
 
+/** @ignore */
 export class Literal<T = any> extends Value<T> {
     constructor(public v: T) { super(); }
 }
 
+/** @ignore */
 export class Col<T = any> extends Value<T> {
     // @ts-ignore
     public vector: Vector;
@@ -56,7 +62,7 @@ export class Col<T = any> extends Value<T> {
     public colidx: number;
 
     constructor(public name: string) { super(); }
-    bind(batch: RecordBatch) {
+    bind(batch: RecordBatch): (idx: number, batch?: RecordBatch) => any {
         if (!this.colidx) {
             // Assume column index doesn't change between calls to bind
             //this.colidx = cols.findIndex(v => v.name.indexOf(this.name) != -1);
@@ -70,11 +76,13 @@ export class Col<T = any> extends Value<T> {
             }
             if (this.colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); }
         }
-        this.vector = batch.getChildAt(this.colidx)!;
-        return this.vector.get.bind(this.vector);
+
+        const vec = this.vector = batch.getChildAt(this.colidx)!;
+        return (idx: number) => vec.get(idx);
     }
 }
 
+/** @ignore */
 export abstract class Predicate {
     abstract bind(batch: RecordBatch): PredicateFunc;
     and(...expr: Predicate[]): And { return new And(this, ...expr); }
@@ -82,6 +90,7 @@ export abstract class Predicate {
     not(): Predicate { return new Not(this); }
 }
 
+/** @ignore */
 export abstract class ComparisonPredicate<T = any> extends Predicate {
     constructor(public readonly left: Value<T>, public readonly right: Value<T>) {
         super();
@@ -110,8 +119,9 @@ export abstract class ComparisonPredicate<T = any> extends Predicate {
     protected abstract _bindLitCol(batch: RecordBatch, lit: Literal, col: Col): PredicateFunc;
 }
 
+/** @ignore */
 export abstract class CombinationPredicate extends Predicate {
-    readonly children: Predicate[]
+    readonly children: Predicate[];
     constructor(...children: Predicate[]) {
         super();
         this.children = children;
@@ -120,12 +130,13 @@ export abstract class CombinationPredicate extends Predicate {
 // add children to prototype so it doesn't get mangled in es2015/umd
 (<any> CombinationPredicate.prototype).children = Object.freeze([]); // freeze for safety
 
+/** @ignore */
 export class And extends CombinationPredicate {
     constructor(...children: Predicate[]) {
         // Flatten any Ands
         children = children.reduce((accum: Predicate[], p: Predicate): Predicate[] => {
-            return accum.concat(p instanceof And ? p.children : p)
-        }, [])
+            return accum.concat(p instanceof And ? p.children : p);
+        }, []);
         super(...children);
     }
     bind(batch: RecordBatch) {
@@ -134,12 +145,13 @@ export class And extends CombinationPredicate {
     }
 }
 
+/** @ignore */
 export class Or extends CombinationPredicate {
     constructor(...children: Predicate[]) {
         // Flatten any Ors
         children = children.reduce((accum: Predicate[], p: Predicate): Predicate[] => {
-            return accum.concat(p instanceof Or ? p.children : p)
-        }, [])
+            return accum.concat(p instanceof Or ?
p.children : p); + }, []); super(...children); } bind(batch: RecordBatch) { @@ -148,6 +160,7 @@ export class Or extends CombinationPredicate { } } +/** @ignore */ export class Equals extends ComparisonPredicate { // Helpers used to cache dictionary reverse lookups between calls to bind private lastDictionary: Vector|undefined; @@ -200,6 +213,7 @@ export class Equals extends ComparisonPredicate { } } +/** @ignore */ export class LTeq extends ComparisonPredicate { protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v <= right.v; @@ -223,6 +237,7 @@ export class LTeq extends ComparisonPredicate { } } +/** @ignore */ export class GTeq extends ComparisonPredicate { protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v >= right.v; @@ -246,6 +261,7 @@ export class GTeq extends ComparisonPredicate { } } +/** @ignore */ export class Not extends Predicate { constructor(public readonly child: Predicate) { super(); @@ -257,6 +273,7 @@ export class Not extends Predicate { } } +/** @ignore */ export class CustomPredicate extends Predicate { constructor(private next: PredicateFunc, private bind_: (batch: RecordBatch) => void) { super(); diff --git a/js/src/data.ts b/js/src/data.ts index 5a117594bc89e..b55321bf98ec2 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -15,317 +15,231 @@ // specific language governing permissions and limitations // under the License. +import { Vector } from './vector'; import { popcnt_bit_range } from './util/bit'; -import { VectorLike, Vector } from './vector'; -import { Int, Bool, FlatListType, List, Struct, Map_ } from './type'; -import { VectorType, TypedArray, TypedArrayConstructor, Dictionary } from './type'; -import { DataType, FlatType, ListType, NestedType, SingleNestedType, DenseUnion, SparseUnion } from './type'; +import { toArrayBufferView } from './util/buffer'; +import { DataType, SparseUnion, DenseUnion } from './type'; +import { VectorType as BufferType, UnionMode, Type } from './enum'; +import { + Dictionary, + Null, Int, Float, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from './type'; -export function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { - if (!ArrayType && ArrayBuffer.isView(values)) { return values; } - return values instanceof ArrayType ? values - : !values || !ArrayBuffer.isView(values) ? 
ArrayType.from(values || []) - : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); -} - -export type Data = DataTypes[T['TType']] & BaseData; -export interface DataTypes { -/* [Type.NONE]*/ 0: BaseData; -/* [Type.Null]*/ 1: FlatData; -/* [Type.Int]*/ 2: FlatData; -/* [Type.Float]*/ 3: FlatData; -/* [Type.Binary]*/ 4: FlatListData; -/* [Type.Utf8]*/ 5: FlatListData; -/* [Type.Bool]*/ 6: BoolData; -/* [Type.Decimal]*/ 7: FlatData; -/* [Type.Date]*/ 8: FlatData; -/* [Type.Time]*/ 9: FlatData; -/* [Type.Timestamp]*/ 10: FlatData; -/* [Type.Interval]*/ 11: FlatData; -/* [Type.List]*/ 12: ListData>; -/* [Type.Struct]*/ 13: NestedData; -/* [Type.Union]*/ 14: UnionData; -/* [Type.FixedSizeBinary]*/ 15: FlatData; -/* [Type.FixedSizeList]*/ 16: SingleNestedData; -/* [Type.Map]*/ 17: NestedData; -/* [Type.DenseUnion]*/ DenseUnion: DenseUnionData; -/*[Type.SparseUnion]*/ SparseUnion: SparseUnionData; -/*[ Type.Dictionary]*/ Dictionary: DictionaryData; -} // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count -// to -1 (any negative number will do). When Array::null_count is called the +// to -1 (any negative number will do). When Vector.nullCount is called the // first time, the null count will be computed. See ARROW-33 -export type kUnknownNullCount = -1; -export const kUnknownNullCount = -1; +/** @ignore */ export type kUnknownNullCount = -1; +/** @ignore */ export const kUnknownNullCount = -1; -export class BaseData implements VectorLike { - public type: T; - public length: number; - public offset: number; - // @ts-ignore - public childData: Data[]; - protected _nullCount: number | kUnknownNullCount; - protected /* [VectorType.OFFSET]:*/ 0?: Int32Array; - protected /* [VectorType.DATA]:*/ 1?: T['TArray']; - protected /*[VectorType.VALIDITY]:*/ 2?: Uint8Array; - protected /* [VectorType.TYPE]:*/ 3?: Int8Array; - constructor(type: T, length: number, offset?: number, nullCount?: number) { - this.type = type; - this.length = Math.floor(Math.max(length || 0, 0)); - this.offset = Math.floor(Math.max(offset || 0, 0)); - this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); - } - public get typeId() { return this.type.TType; } - public get nullBitmap() { return this[VectorType.VALIDITY]; } - public get nullCount() { - let nullCount = this._nullCount; - let nullBitmap: Uint8Array | undefined; - if (nullCount === -1 && (nullBitmap = this[VectorType.VALIDITY])) { - this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); - } - return nullCount; - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new BaseData(type, length, offset, nullCount) as any; - } - public slice(offset: number, length: number) { - return length <= 0 ? 
this : this.sliceInternal(this.clone( - this.type, length, this.offset + offset, +(this._nullCount === 0) - 1 - ) as any, offset, length); - } - protected sliceInternal(clone: this, offset: number, length: number) { - let arr: any; - // If typeIds exist, slice the typeIds buffer - (arr = this[VectorType.TYPE]) && (clone[VectorType.TYPE] = this.sliceData(arr, offset, length)); - // If offsets exist, only slice the offsets buffer - (arr = this[VectorType.OFFSET]) && (clone[VectorType.OFFSET] = this.sliceOffsets(arr, offset, length)) || - // Otherwise if no offsets, slice the data buffer - (arr = this[VectorType.DATA]) && (clone[VectorType.DATA] = this.sliceData(arr, offset, length)); - return clone; - } - protected sliceData(data: T['TArray'] & TypedArray, offset: number, length: number) { - return data.subarray(offset, offset + length); - } - protected sliceOffsets(valueOffsets: Int32Array, offset: number, length: number) { - return valueOffsets.subarray(offset, offset + length + 1); - } -} +/** @ignore */ export type NullBuffer = Uint8Array | null | undefined; +/** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable; +/** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable; +/** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable; -export class FlatData extends BaseData { - public /* [VectorType.DATA]:*/ 1: T['TArray']; - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - public get values() { return this[VectorType.DATA]; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, data: Iterable, offset?: number, nullCount?: number) { - super(type, length, offset, nullCount); - this[VectorType.DATA] = toTypedArray(this.ArrayType, data); - this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); - } - public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { - return new (this.constructor as any)(type, length, this[VectorType.VALIDITY], this[VectorType.DATA], offset, nullCount) as FlatData; - } +/** @ignore */ +export interface Buffers { + [BufferType.OFFSET]: Int32Array; + [BufferType.DATA]: T['TArray']; + [BufferType.VALIDITY]: Uint8Array; + [BufferType.TYPE]: T['TArray']; } -export class BoolData extends FlatData { - protected sliceData(data: Uint8Array) { return data; } +/** @ignore */ +export interface Data { + readonly TType: T['TType']; + readonly TArray: T['TArray']; + readonly TValue: T['TValue']; } -export class FlatListData extends FlatData { - public /* [VectorType.OFFSET]:*/ 0: Int32Array; - public /* [VectorType.DATA]:*/ 1: T['TArray']; - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - public get values() { return this[VectorType.DATA]; } - public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, data: T['TArray'], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, data, offset, nullCount); - this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { - return new FlatListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this[VectorType.DATA], offset, nullCount) as FlatListData; - } -} +/** @ignore */ +export class Data { -export class DictionaryData extends BaseData> { - protected _dictionary: Vector; - 
protected _indices: Data>; - public get indices() { return this._indices; } - public get dictionary() { return this._dictionary; } - constructor(type: Dictionary, dictionary: Vector, indices: Data>) { - super(type, indices.length, indices.offset, (indices as any)._nullCount); - this._indices = indices; - this._dictionary = dictionary; - } - public get nullCount() { return this._indices.nullCount; } - public get nullBitmap() { return this._indices.nullBitmap; } - public clone>(type: R, length = this.length, offset = this.offset) { - const data = this._dictionary.data.clone(type.dictionary as any); - return new DictionaryData( - this.type as any, - this._dictionary.clone(data) as any, - this._indices.slice(offset - this.offset, length) - ) as any; - } - protected sliceInternal(clone: this, _offset: number, _length: number) { - clone.length = clone._indices.length; - clone._nullCount = (clone._indices as any)._nullCount; - return clone; - } -} + public readonly type: T; + public readonly length: number; + public readonly offset: number; + public readonly stride: number; + public readonly childData: Data[]; + public readonly values: Buffers[BufferType.DATA]; + public readonly typeIds: Buffers[BufferType.TYPE]; + // @ts-ignore + public readonly nullBitmap: Buffers[BufferType.VALIDITY]; + // @ts-ignore + public readonly valueOffsets: Buffers[BufferType.OFFSET]; -export class NestedData extends BaseData { - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, offset, nullCount); - this.childData = childData; - this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new NestedData(type, length, this[VectorType.VALIDITY], this.childData, offset, nullCount) as any; + public get ArrayType() { return this.type.ArrayType; } + public get typeId(): T['TType'] { return this.type.typeId; } + public get buffers() { + return [this.valueOffsets, this.values, this.nullBitmap, this.typeIds] as Buffers; } - protected sliceInternal(clone: this, offset: number, length: number) { - if (!this[VectorType.OFFSET]) { - clone.childData = this.childData.map((child) => child.slice(offset, length)); + + protected _nullCount: number | kUnknownNullCount; + + public get nullCount() { + let nullCount = this._nullCount; + let nullBitmap: Uint8Array | undefined; + if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { + this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); } - return super.sliceInternal(clone, offset, length); + return nullCount; } -} -export class SingleNestedData extends NestedData { - protected _valuesData: Data; - public get values() { return this._valuesData; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueChildData: Data, offset?: number, nullCount?: number) { - super(type, length, nullBitmap, [valueChildData], offset, nullCount); - this._valuesData = valueChildData; + constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial> | Data, childData?: (Data | Vector)[]) { + this.type = type; + this.offset = Math.floor(Math.max(offset || 0, 0)); + this.length = Math.floor(Math.max(length || 0, 0)); + this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); + this.childData = 
(childData || []).map((x) => x instanceof Data ? x : x.data) as Data[]; + let buffer: Buffers[keyof Buffers]; + if (buffers instanceof Data) { + this.stride = buffers.stride; + this.values = buffers.values; + this.typeIds = buffers.typeIds; + this.nullBitmap = buffers.nullBitmap; + this.valueOffsets = buffers.valueOffsets; + } else { + if (buffers) { + (buffer = (buffers as Buffers)[0]) && (this.valueOffsets = buffer); + (buffer = (buffers as Buffers)[1]) && (this.values = buffer); + (buffer = (buffers as Buffers)[2]) && (this.nullBitmap = buffer); + (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); + } + const t: any = type; + switch (type.typeId) { + case Type.Decimal: this.stride = 4; break; + case Type.Timestamp: this.stride = 2; break; + case Type.Date: this.stride = 1 + (t as Date_).unit; break; + case Type.Interval: this.stride = 1 + (t as Interval).unit; break; + case Type.Int: this.stride = 1 + +((t as Int).bitWidth > 32); break; + case Type.Time: this.stride = 1 + +((t as Time).bitWidth > 32); break; + case Type.FixedSizeList: this.stride = (t as FixedSizeList).listSize; break; + case Type.FixedSizeBinary: this.stride = (t as FixedSizeBinary).byteWidth; break; + default: this.stride = 1; + } + } } -} -export class ListData extends SingleNestedData { - public /* [VectorType.OFFSET]:*/ 0: Int32Array; - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, valueChildData: Data, offset?: number, nullCount?: number) { - super(type, length, nullBitmap, valueChildData, offset, nullCount); - this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + public clone(type: R, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers = this, childData: (Data | Vector)[] = this.childData) { + return new Data(type, offset, length, nullCount, buffers, childData); } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new ListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this._valuesData as any, offset, nullCount) as any; - } -} -export class UnionData extends NestedData { - public /* [VectorType.TYPE]:*/ 3: T['TArray']; - public get typeIds() { return this[VectorType.TYPE]; } - public readonly typeIdToChildIndex: { [key: number]: number }; - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, childData, offset, nullCount); - this[VectorType.TYPE] = toTypedArray(Int8Array, typeIds); - this.typeIdToChildIndex = type.typeIds.reduce((typeIdToChildIndex, typeId, idx) => { - return (typeIdToChildIndex[typeId] = idx) && typeIdToChildIndex || typeIdToChildIndex; - }, Object.create(null) as { [key: number]: number }); + public slice(offset: number, length: number): Data { + // +true === 1, +false === 0, so this means + // we keep nullCount at 0 if it's already 0, + // otherwise set to the invalidated flag -1 + const { stride, typeId, childData } = this; + const nullCount = +(this._nullCount === 0) - 1; + const childStride = typeId === 16 /* FixedSizeList */ ? 
stride : 1; + const buffers = this._sliceBuffers(offset, length, stride, typeId); + return this.clone(this.type, this.offset + offset, length, nullCount, buffers, + // Don't slice children if we have value offsets (the variable-width types) + (!childData.length || this.valueOffsets) ? childData : this._sliceChildren(childData, childStride * offset, childStride * length)); } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new UnionData(type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this.childData, offset, nullCount) as any; - } -} -export class SparseUnionData extends UnionData { - constructor(type: SparseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, typeIds, childData, offset, nullCount); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new SparseUnionData( - type, - length, - this[VectorType.VALIDITY], - this[VectorType.TYPE], - this.childData, - offset, nullCount - ) as any; + protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers { + let arr: any, { buffers } = this; + // If typeIds exist, slice the typeIds buffer + (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? 
arr : arr.subarray(stride * offset, stride * (offset + length))); + return buffers; } -} -export class DenseUnionData extends UnionData { - public /* [VectorType.OFFSET]:*/ 0: Int32Array; - public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: DenseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, valueOffsets: Iterable, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, typeIds, childData, offset, nullCount); - this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new DenseUnionData( - type, - length, - this[VectorType.VALIDITY], - this[VectorType.TYPE], - this[VectorType.OFFSET], - this.childData, - offset, nullCount - ) as any; + protected _sliceChildren(childData: Data[], offset: number, length: number): Data[] { + return childData.map((child) => child.slice(offset, length)); } -} -export class ChunkedData extends BaseData { - // @ts-ignore - protected _chunkData: Data[]; - protected _chunkVectors: Vector[]; - protected _chunkOffsets: Uint32Array; - public get chunkVectors() { return this._chunkVectors; } - public get chunkOffsets() { return this._chunkOffsets; } - public get chunkData() { - return this._chunkData || ( - this._chunkData = this._chunkVectors.map(({ data }) => data)); - } - constructor(type: T, length: number, chunkVectors: Vector[], offset?: number, nullCount?: number, chunkOffsets?: Uint32Array) { - super(type, length, offset, nullCount); - this._chunkVectors = chunkVectors; - this._chunkOffsets = chunkOffsets || ChunkedData.computeOffsets(chunkVectors); - } - public get nullCount() { - let nullCount = this._nullCount; - if (nullCount === -1) { - this._nullCount = nullCount = this._chunkVectors.reduce((x, c) => x + c.nullCount, 0); + // + // Convenience methods for creating Data instances for each of the Arrow Vector types + // + /** @nocollapse */ + public static Null(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Int(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Dictionary(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.indices.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Float(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Bool(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Decimal(type: T, offset: number, length: 
number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Date(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Time(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Timestamp(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Interval(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static FixedSizeBinary(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Binary(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, data: Uint8Array) { + return new Data(type, offset, length, nullCount, [toArrayBufferView(Int32Array, valueOffsets), toArrayBufferView(Uint8Array, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Utf8(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, data: Uint8Array) { + return new Data(type, offset, length, nullCount, [toArrayBufferView(Int32Array, valueOffsets), toArrayBufferView(Uint8Array, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static List(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, child: Data | Vector) { + return new Data(type, offset, length, nullCount, [toArrayBufferView(Int32Array, valueOffsets), undefined, toArrayBufferView(Uint8Array, nullBitmap)], [child]); + } + /** @nocollapse */ + public static FixedSizeList(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, child: Data | Vector) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)], [child]); + } + /** @nocollapse */ + public static Struct(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, children: (Data | Vector)[]) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)], children); + } + /** @nocollapse */ + public static Map(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, children: (Data | Vector)[]) 
{ + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)], children); + } + public static Union(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, children: (Data | Vector)[]): Data; + public static Union(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsets: ValueOffsetsBuffer, children: (Data | Vector)[]): Data; + /** @nocollapse */ + public static Union(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsetsOrChildren: ValueOffsetsBuffer | (Data | Vector)[], children?: (Data | Vector)[]) { + const buffers = [ + undefined, undefined, + toArrayBufferView(Uint8Array, nullBitmap), + toArrayBufferView(type.ArrayType, typeIds) + ] as Partial>; + if (type.mode === UnionMode.Sparse) { + return new Data(type, offset, length, nullCount, buffers, valueOffsetsOrChildren as (Data | Vector)[]); } - return nullCount; - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new ChunkedData( - type, length, - this._chunkVectors.map((vec) => vec.clone(vec.data.clone(type))) as any, - offset, nullCount, this._chunkOffsets - ) as any; - } - protected sliceInternal(clone: this, offset: number, length: number) { - const chunks = this._chunkVectors; - const offsets = this._chunkOffsets; - const chunkSlices: Vector[] = []; - for (let childIndex = -1, numChildren = chunks.length; ++childIndex < numChildren;) { - const child = chunks[childIndex]; - const childLength = child.length; - const childOffset = offsets[childIndex]; - // If the child is to the right of the slice boundary, exclude - if (childOffset >= offset + length) { continue; } - // If the child is to the left of of the slice boundary, exclude - if (offset >= childOffset + childLength) { continue; } - // If the child is between both left and right boundaries, include w/o slicing - if (childOffset >= offset && (childOffset + childLength) <= offset + length) { - chunkSlices.push(child); - continue; - } - // If the child overlaps one of the slice boundaries, include that slice - const begin = Math.max(0, offset - childOffset); - const end = begin + Math.min(childLength - begin, (offset + length) - childOffset); - chunkSlices.push(child.slice(begin, end)); - } - clone._chunkVectors = chunkSlices; - clone._chunkOffsets = ChunkedData.computeOffsets(chunkSlices); - return clone; - } - static computeOffsets(childVectors: Vector[]) { - const childOffsets = new Uint32Array(childVectors.length + 1); - for (let index = 0, length = childOffsets.length, childOffset = childOffsets[0] = 0; ++index < length;) { - childOffsets[index] = (childOffset += childVectors[index - 1].length); - } - return childOffsets; + buffers[BufferType.OFFSET] = toArrayBufferView(Int32Array, valueOffsetsOrChildren); + return new Data(type, offset, length, nullCount, buffers, children); } } + +((Data.prototype as any).childData = Object.freeze([])); diff --git a/js/src/enum.ts b/js/src/enum.ts new file mode 100644 index 0000000000000..0be6a4ed2938e --- /dev/null +++ b/js/src/enum.ts @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import * as Schema_ from './fb/Schema';
+import * as Message_ from './fb/Message';
+
+export import ArrowType = Schema_.org.apache.arrow.flatbuf.Type;
+export import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit;
+export import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit;
+export import Precision = Schema_.org.apache.arrow.flatbuf.Precision;
+export import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode;
+export import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType;
+export import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit;
+export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader;
+export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion;
+
+/**
+ * Main data type enumeration.
+ *
+ * Data types in this library are all *logical*. They can be expressed as
+ * either a primitive physical type (bytes or bits of some fixed size), a
+ * nested type consisting of other data types, or another data type (e.g. a
+ * timestamp encoded as an int64).
+ */
+export enum Type {
+    NONE            =  0, // The default placeholder type
+    Null            =  1, // A NULL type having no physical storage
+    Int             =  2, // Signed or unsigned 8, 16, 32, or 64-bit little-endian integer
+    Float           =  3, // 2, 4, or 8-byte floating point value
+    Binary          =  4, // Variable-length bytes (no guarantee of UTF8-ness)
+    Utf8            =  5, // UTF8 variable-length string as List<Char>
+    Bool            =  6, // Boolean as 1 bit, LSB bit-packed ordering
+    Decimal         =  7, // Precision-and-scale-based decimal type. Storage type depends on the parameters.
+    Date            =  8, // int32_t days or int64_t milliseconds since the UNIX epoch
+    Time            =  9, // Time as signed 32 or 64-bit integer, representing either seconds, milliseconds, microseconds, or nanoseconds since midnight
+    Timestamp       = 10, // Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond)
+    Interval        = 11, // YEAR_MONTH or DAY_TIME interval in SQL style
+    List            = 12, // A list of some logical data type
+    Struct          = 13, // Struct of logical types
+    Union           = 14, // Union of logical types
+    FixedSizeBinary = 15, // Fixed-size binary. Each value occupies the same number of bytes
+    FixedSizeList   = 16, // Fixed-size list. Each value occupies the same number of bytes
+    Map             = 17, // Map of named logical types
+
+    // These enum values are here so that TypeScript can narrow the type signatures further
+    // beyond the base Arrow types. The base Arrow types include metadata like bitWidths that
+    // impact the type signatures of the values we return. For example, the Int8Vector reads
+    // 1-byte numbers from an Int8Array, an Int32Vector reads a 4-byte number from an Int32Array,
+    // and an Int64Vector reads a pair of 4-byte lo, hi int32s, and returns them as a zero-copy
+    // slice from an underlying Int32Array.
Library consumers benefit by doing this type narrowing, + // since we can ensure the types across all public methods are propagated and never bail to `any`. + // These values are _never_ actually used at runtime, and they will _never_ be written into the + // flatbuffers metadata of serialized Arrow IPC payloads. + Dictionary = -1, // Dictionary aka Category type + Int8 = -2, + Int16 = -3, + Int32 = -4, + Int64 = -5, + Uint8 = -6, + Uint16 = -7, + Uint32 = -8, + Uint64 = -9, + Float16 = -10, + Float32 = -11, + Float64 = -12, + DateDay = -13, + DateMillisecond = -14, + TimestampSecond = -15, + TimestampMillisecond = -16, + TimestampMicrosecond = -17, + TimestampNanosecond = -18, + TimeSecond = -19, + TimeMillisecond = -20, + TimeMicrosecond = -21, + TimeNanosecond = -22, + DenseUnion = -23, + SparseUnion = -24, + IntervalDayTime = -25, + IntervalYearMonth = -26, +} diff --git a/js/src/fb/Schema.ts b/js/src/fb/Schema.ts index 4a4aeb65599be..e9829d9d8348a 100644 --- a/js/src/fb/Schema.ts +++ b/js/src/fb/Schema.ts @@ -588,7 +588,7 @@ export namespace org.apache.arrow.flatbuf { * @param {Array.} data * @returns {flatbuffers.Offset} */ - static createTypeIdsVector(builder: flatbuffers.Builder, data: number[] | Uint8Array): flatbuffers.Offset { + static createTypeIdsVector(builder: flatbuffers.Builder, data: number[] | Int32Array): flatbuffers.Offset { builder.startVector(4, data.length, 4); for (let i = data.length - 1; i >= 0; i--) { builder.addInt32(data[i]); diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts new file mode 100644 index 0000000000000..ae38d4e5be333 --- /dev/null +++ b/js/src/interfaces.ts @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from './data'; +import { Type } from './enum'; +import * as type from './type'; +import { DataType } from './type'; +import * as vecs from './vector/index'; + +/** @ignore */ +export interface ArrayBufferViewConstructor { + readonly prototype: T; + new(length: number): T; + new(arrayOrArrayBuffer: ArrayLike | ArrayBufferLike): T; + new(buffer: ArrayBufferLike, byteOffset: number, length?: number): T; + /** + * The size in bytes of each element in the array. + */ + readonly BYTES_PER_ELEMENT: number; + /** + * Returns a new array from a set of elements. + * @param items A set of elements to include in the new array object. + */ + of(...items: number[]): T; + /** + * Creates an array from an array-like or iterable object. + * @param arrayLike An array-like or iterable object to convert to an array. + * @param mapfn A mapping function to call on every element of the array. + * @param thisArg Value of 'this' used to invoke the mapfn. 
+ */ + from(arrayLike: ArrayLike, mapfn?: (v: number, k: number) => number, thisArg?: any): T; +} + +/** @ignore */ +export type VectorCtorArgs< + T extends Vector, + R extends DataType = any, + TArgs extends any[] = any[], + TCtor extends new (data: Data, ...args: TArgs) => T = + new (data: Data, ...args: TArgs) => T +> = TCtor extends new (data: Data, ...args: infer TArgs) => T ? TArgs : never; + +/** + * Obtain the constructor function of an instance type + * @ignore + */ +export type ConstructorType< + T, + TCtor extends new (...args: any[]) => T = + new (...args: any[]) => T +> = TCtor extends new (...args: any[]) => T ? TCtor : never; + +/** @ignore */ +export type VectorCtorType< + T extends Vector, + R extends DataType = any, + TCtor extends new (data: Data, ...args: VectorCtorArgs) => T = + new (data: Data, ...args: VectorCtorArgs) => T +> = TCtor extends new (data: Data, ...args: VectorCtorArgs) => T ? TCtor : never; + +/** @ignore */ +export type Vector = + T extends Type ? TypeToVector : + T extends DataType ? DataTypeToVector : + never + ; + +/** @ignore */ +export type VectorCtor = + T extends Vector ? VectorCtorType : + T extends Type ? VectorCtorType> : + T extends DataType ? VectorCtorType> : + VectorCtorType + ; + +/** @ignore */ +export type DataTypeCtor = + T extends DataType ? ConstructorType : + T extends Vector ? ConstructorType : + T extends Type ? ConstructorType> : + never + ; + +/** @ignore */ +type TypeToVector = + T extends Type.Null ? vecs.NullVector : + T extends Type.Bool ? vecs.BoolVector : + T extends Type.Int8 ? vecs.Int8Vector : + T extends Type.Int16 ? vecs.Int16Vector : + T extends Type.Int32 ? vecs.Int32Vector : + T extends Type.Int64 ? vecs.Int64Vector : + T extends Type.Uint8 ? vecs.Uint8Vector : + T extends Type.Uint16 ? vecs.Uint16Vector : + T extends Type.Uint32 ? vecs.Uint32Vector : + T extends Type.Uint64 ? vecs.Uint64Vector : + T extends Type.Int ? vecs.IntVector : + T extends Type.Float16 ? vecs.Float16Vector : + T extends Type.Float32 ? vecs.Float32Vector : + T extends Type.Float64 ? vecs.Float64Vector : + T extends Type.Float ? vecs.FloatVector : + T extends Type.Utf8 ? vecs.Utf8Vector : + T extends Type.Binary ? vecs.BinaryVector : + T extends Type.FixedSizeBinary ? vecs.FixedSizeBinaryVector : + T extends Type.Date ? vecs.DateVector : + T extends Type.DateDay ? vecs.DateDayVector : + T extends Type.DateMillisecond ? vecs.DateMillisecondVector : + T extends Type.Timestamp ? vecs.TimestampVector : + T extends Type.TimestampSecond ? vecs.TimestampSecondVector : + T extends Type.TimestampMillisecond ? vecs.TimestampMillisecondVector : + T extends Type.TimestampMicrosecond ? vecs.TimestampMicrosecondVector : + T extends Type.TimestampNanosecond ? vecs.TimestampNanosecondVector : + T extends Type.Time ? vecs.TimeVector : + T extends Type.TimeSecond ? vecs.TimeSecondVector : + T extends Type.TimeMillisecond ? vecs.TimeMillisecondVector : + T extends Type.TimeMicrosecond ? vecs.TimeMicrosecondVector : + T extends Type.TimeNanosecond ? vecs.TimeNanosecondVector : + T extends Type.Decimal ? vecs.DecimalVector : + T extends Type.Union ? vecs.UnionVector : + T extends Type.DenseUnion ? vecs.DenseUnionVector : + T extends Type.SparseUnion ? vecs.SparseUnionVector : + T extends Type.Interval ? vecs.IntervalVector : + T extends Type.IntervalDayTime ? vecs.IntervalDayTimeVector : + T extends Type.IntervalYearMonth ? vecs.IntervalYearMonthVector : + T extends Type.Map ? vecs.MapVector : + T extends Type.List ? vecs.ListVector : + T extends Type.Struct ? 
vecs.StructVector : + T extends Type.Dictionary ? vecs.DictionaryVector : + T extends Type.FixedSizeList ? vecs.FixedSizeListVector : + vecs.BaseVector + ; + +/** @ignore */ +type DataTypeToVector = + T extends type.Null ? vecs.NullVector : + T extends type.Bool ? vecs.BoolVector : + T extends type.Int8 ? vecs.Int8Vector : + T extends type.Int16 ? vecs.Int16Vector : + T extends type.Int32 ? vecs.Int32Vector : + T extends type.Int64 ? vecs.Int64Vector : + T extends type.Uint8 ? vecs.Uint8Vector : + T extends type.Uint16 ? vecs.Uint16Vector : + T extends type.Uint32 ? vecs.Uint32Vector : + T extends type.Uint64 ? vecs.Uint64Vector : + T extends type.Int ? vecs.IntVector : + T extends type.Float16 ? vecs.Float16Vector : + T extends type.Float32 ? vecs.Float32Vector : + T extends type.Float64 ? vecs.Float64Vector : + T extends type.Float ? vecs.FloatVector : + T extends type.Utf8 ? vecs.Utf8Vector : + T extends type.Binary ? vecs.BinaryVector : + T extends type.FixedSizeBinary ? vecs.FixedSizeBinaryVector : + T extends type.Date_ ? vecs.DateVector : + T extends type.DateDay ? vecs.DateDayVector : + T extends type.DateMillisecond ? vecs.DateMillisecondVector : + T extends type.Timestamp ? vecs.TimestampVector : + T extends type.TimestampSecond ? vecs.TimestampSecondVector : + T extends type.TimestampMillisecond ? vecs.TimestampMillisecondVector : + T extends type.TimestampMicrosecond ? vecs.TimestampMicrosecondVector : + T extends type.TimestampNanosecond ? vecs.TimestampNanosecondVector : + T extends type.Time ? vecs.TimeVector : + T extends type.TimeSecond ? vecs.TimeSecondVector : + T extends type.TimeMillisecond ? vecs.TimeMillisecondVector : + T extends type.TimeMicrosecond ? vecs.TimeMicrosecondVector : + T extends type.TimeNanosecond ? vecs.TimeNanosecondVector : + T extends type.Decimal ? vecs.DecimalVector : + T extends type.Union ? vecs.UnionVector : + T extends type.DenseUnion ? vecs.DenseUnionVector : + T extends type.SparseUnion ? vecs.SparseUnionVector : + T extends type.Interval ? vecs.IntervalVector : + T extends type.IntervalDayTime ? vecs.IntervalDayTimeVector : + T extends type.IntervalYearMonth ? vecs.IntervalYearMonthVector : + T extends type.Map_ ? vecs.MapVector : + T extends type.List ? vecs.ListVector : + T extends type.Struct ? vecs.StructVector : + T extends type.Dictionary ? vecs.DictionaryVector : + T extends type.FixedSizeList ? vecs.FixedSizeListVector : + vecs.BaseVector + ; + +/** @ignore */ +type TypeToDataType = + T extends Type.Null ? type.Null + : T extends Type.Bool ? type.Bool + : T extends Type.Int ? type.Int + : T extends Type.Int16 ? type.Int16 + : T extends Type.Int32 ? type.Int32 + : T extends Type.Int64 ? type.Int64 + : T extends Type.Uint8 ? type.Uint8 + : T extends Type.Uint16 ? type.Uint16 + : T extends Type.Uint32 ? type.Uint32 + : T extends Type.Uint64 ? type.Uint64 + : T extends Type.Int8 ? type.Int8 + : T extends Type.Float16 ? type.Float16 + : T extends Type.Float32 ? type.Float32 + : T extends Type.Float64 ? type.Float64 + : T extends Type.Float ? type.Float + : T extends Type.Utf8 ? type.Utf8 + : T extends Type.Binary ? type.Binary + : T extends Type.FixedSizeBinary ? type.FixedSizeBinary + : T extends Type.Date ? type.Date_ + : T extends Type.DateDay ? type.DateDay + : T extends Type.DateMillisecond ? type.DateMillisecond + : T extends Type.Timestamp ? type.Timestamp + : T extends Type.TimestampSecond ? type.TimestampSecond + : T extends Type.TimestampMillisecond ? type.TimestampMillisecond + : T extends Type.TimestampMicrosecond ? 
type.TimestampMicrosecond + : T extends Type.TimestampNanosecond ? type.TimestampNanosecond + : T extends Type.Time ? type.Time + : T extends Type.TimeSecond ? type.TimeSecond + : T extends Type.TimeMillisecond ? type.TimeMillisecond + : T extends Type.TimeMicrosecond ? type.TimeMicrosecond + : T extends Type.TimeNanosecond ? type.TimeNanosecond + : T extends Type.Decimal ? type.Decimal + : T extends Type.Union ? type.Union + : T extends Type.DenseUnion ? type.DenseUnion + : T extends Type.SparseUnion ? type.SparseUnion + : T extends Type.Interval ? type.Interval + : T extends Type.IntervalDayTime ? type.IntervalDayTime + : T extends Type.IntervalYearMonth ? type.IntervalYearMonth + : T extends Type.Map ? type.Map_ + : T extends Type.List ? type.List + : T extends Type.Struct ? type.Struct + : T extends Type.Dictionary ? type.Dictionary + : T extends Type.FixedSizeList ? type.FixedSizeList + : DataType + ; diff --git a/js/src/io/adapters.ts b/js/src/io/adapters.ts new file mode 100644 index 0000000000000..427fc29ab2228 --- /dev/null +++ b/js/src/io/adapters.ts @@ -0,0 +1,386 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
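Every adapter in the file below follows the same primed-generator protocol: the generator yields `null` once so `pump` can advance it past the first yield, and from then on each `next({ cmd, size })` call carries a command into the generator, which buffers incoming chunks until it can serve the requested byte range ('peek' leaves the buffer intact, 'read' consumes it). A minimal standalone sketch of that protocol, with illustrative names rather than Arrow APIs:

```ts
type Command = { cmd: 'peek' | 'read'; size: number };

function* commandSource(chunks: Uint8Array[]): Generator<Uint8Array | null, void, Command> {
    let buffer = new Uint8Array(0);
    // Prime: yield once so the first next(command) delivers a command
    let { cmd, size } = yield null;
    for (const chunk of chunks) {
        // append the incoming chunk to the internal buffer
        const next = new Uint8Array(buffer.byteLength + chunk.byteLength);
        next.set(buffer, 0);
        next.set(chunk, buffer.byteLength);
        buffer = next;
        // serve commands for as long as enough bytes are available
        while (buffer.byteLength >= size) {
            const out = buffer.subarray(0, size);
            if (cmd === 'read') { buffer = buffer.subarray(size); }
            ({ cmd, size } = yield out);
        }
    }
}

const source = commandSource([Uint8Array.of(1, 2, 3), Uint8Array.of(4, 5)]);
source.next();                                              // prime past the first yield
const peeked = source.next({ cmd: 'peek', size: 4 }).value; // [1, 2, 3, 4], not consumed
const read = source.next({ cmd: 'read', size: 4 }).value;   // [1, 2, 3, 4], consumed
```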
+ +import { + toUint8Array, + joinUint8Arrays, + ArrayBufferViewInput, + toUint8ArrayIterator, + toUint8ArrayAsyncIterator +} from '../util/buffer'; + +import { ReadableDOMStreamOptions } from './interfaces'; + +/** @ignore */ +export default { + fromIterable(source: Iterable | T): IterableIterator { + return pump(fromIterable(source)); + }, + fromAsyncIterable(source: AsyncIterable | PromiseLike): AsyncIterableIterator { + return pump(fromAsyncIterable(source)); + }, + fromDOMStream(source: ReadableStream): AsyncIterableIterator { + return pump(fromDOMStream(source)); + }, + fromNodeStream(stream: NodeJS.ReadableStream): AsyncIterableIterator { + return pump(fromNodeStream(stream)); + }, + // @ts-ignore + toDOMStream(source: Iterable | AsyncIterable, options?: ReadableDOMStreamOptions): ReadableStream { + throw new Error(`"toDOMStream" not available in this environment`); + }, + // @ts-ignore + toNodeStream(source: Iterable | AsyncIterable, options?: import('stream').ReadableOptions): import('stream').Readable { + throw new Error(`"toNodeStream" not available in this environment`); + }, +}; + +/** @ignore */ +const pump = | AsyncIterator>(iterator: T) => { iterator.next(); return iterator; }; + +/** @ignore */ +function* fromIterable(source: Iterable | T): IterableIterator { + + let done: boolean, threw = false; + let buffers: Uint8Array[] = [], buffer: Uint8Array; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before creating the source Iterator + ({ cmd, size } = yield null); + + // initialize the iterator + let it = toUint8ArrayIterator(source)[Symbol.iterator](); + + try { + do { + // read the next value + ({ done, value: buffer } = isNaN(size - bufferLength) ? + it.next(undefined) : it.next(size - bufferLength)); + // if chunk is not null or empty, push it onto the queue + if (!done && buffer.byteLength > 0) { + buffers.push(buffer); + bufferLength += buffer.byteLength; + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } catch (e) { + (threw = true) && (typeof it.throw === 'function') && (it.throw(e)); + } finally { + (threw === false) && (typeof it.return === 'function') && (it.return()); + } +} + +/** @ignore */ +async function* fromAsyncIterable(source: AsyncIterable | PromiseLike): AsyncIterableIterator { + + let done: boolean, threw = false; + let buffers: Uint8Array[] = [], buffer: Uint8Array; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before creating the source AsyncIterator + ({ cmd, size } = yield null); + + // initialize the iterator + let it = toUint8ArrayAsyncIterator(source)[Symbol.asyncIterator](); + + try { + do { + // read the next value + ({ done, value: buffer } = isNaN(size - bufferLength) + ? 
await it.next(undefined) + : await it.next(size - bufferLength)); + // if chunk is not null or empty, push it onto the queue + if (!done && buffer.byteLength > 0) { + buffers.push(buffer); + bufferLength += buffer.byteLength; + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } catch (e) { + (threw = true) && (typeof it.throw === 'function') && (await it.throw(e)); + } finally { + (threw === false) && (typeof it.return === 'function') && (await it.return()); + } +} + +// All this manual Uint8Array chunk management can be avoided if/when engines +// add support for ArrayBuffer.transfer() or ArrayBuffer.prototype.realloc(): +// https://github.com/domenic/proposal-arraybuffer-transfer +/** @ignore */ +async function* fromDOMStream(source: ReadableStream): AsyncIterableIterator { + + let done = false, threw = false; + let buffers: Uint8Array[] = [], buffer: Uint8Array; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before we establish the ReadableStream lock + ({ cmd, size } = yield null); + + // initialize the reader and lock the stream + let it = new AdaptiveByteReader(source); + + try { + do { + // read the next value + ({ done, value: buffer } = isNaN(size - bufferLength) + ? await it['read'](undefined) + : await it['read'](size - bufferLength)); + // if chunk is not null or empty, push it onto the queue + if (!done && buffer.byteLength > 0) { + buffers.push(toUint8Array(buffer)); + bufferLength += buffer.byteLength; + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } catch (e) { + (threw = true) && (await it['cancel'](e)); + } finally { + (threw === false) ? (await it['cancel']()) + : source['locked'] && it.releaseLock(); + } +} + +/** @ignore */ +class AdaptiveByteReader { + + private supportsBYOB: boolean; + private byobReader: ReadableStreamBYOBReader | null = null; + private defaultReader: ReadableStreamDefaultReader | null = null; + private reader: ReadableStreamBYOBReader | ReadableStreamDefaultReader | null; + + constructor(private source: ReadableStream) { + try { + this.supportsBYOB = !!(this.reader = this.getBYOBReader()); + } catch (e) { + this.supportsBYOB = !!!(this.reader = this.getDefaultReader()); + } + } + + get closed(): Promise { + return this.reader ? this.reader['closed'].catch(() => {}) : Promise.resolve(); + } + + releaseLock(): void { + if (this.reader) { + this.reader.releaseLock(); + } + this.reader = this.byobReader = this.defaultReader = null; + } + + async cancel(reason?: any): Promise { + const { reader, source } = this; + reader && (await reader['cancel'](reason)); + source && (source['locked'] && this.releaseLock()); + } + + async read(size?: number): Promise> { + if (size === 0) { + return { done: this.reader == null, value: new Uint8Array(0) }; + } + const result = !this.supportsBYOB || typeof size !== 'number' + ? 
await this.getDefaultReader().read() + : await this.readFromBYOBReader(size); + !result.done && (result.value = toUint8Array(result as ReadableStreamReadResult)); + return result as ReadableStreamReadResult; + } + + private getDefaultReader() { + if (this.byobReader) { this.releaseLock(); } + if (!this.defaultReader) { + this.defaultReader = this.source['getReader'](); + // We have to catch and swallow errors here to avoid uncaught promise rejection exceptions + // that seem to be raised when we call `releaseLock()` on this reader. I'm still mystified + // about why these errors are raised, but I'm sure there's some important spec reason that + // I haven't considered. I hate to employ such an anti-pattern here, but it seems like the + // only solution in this case :/ + this.defaultReader['closed'].catch(() => {}); + } + return (this.reader = this.defaultReader); + } + + private getBYOBReader() { + if (this.defaultReader) { this.releaseLock(); } + if (!this.byobReader) { + this.byobReader = this.source['getReader']({ mode: 'byob' }); + // We have to catch and swallow errors here to avoid uncaught promise rejection exceptions + // that seem to be raised when we call `releaseLock()` on this reader. I'm still mystified + // about why these errors are raised, but I'm sure there's some important spec reason that + // I haven't considered. I hate to employ such an anti-pattern here, but it seems like the + // only solution in this case :/ + this.byobReader['closed'].catch(() => {}); + } + return (this.reader = this.byobReader); + } + + // This strategy plucked from the example in the streams spec: + // https://streams.spec.whatwg.org/#example-manual-read-bytes + private async readFromBYOBReader(size: number) { + return await readInto(this.getBYOBReader(), new ArrayBuffer(size), 0, size); + } +} + +/** @ignore */ +async function readInto(reader: ReadableStreamBYOBReader, buffer: ArrayBufferLike, offset: number, size: number): Promise> { + if (offset >= size) { + return { done: false, value: new Uint8Array(buffer, 0, size) }; + } + const { done, value } = await reader.read(new Uint8Array(buffer, offset, size - offset)); + if (((offset += value.byteLength) < size) && !done) { + return await readInto(reader, value.buffer, offset, size); + } + return { done, value: new Uint8Array(value.buffer, 0, offset) }; +} + +/** @ignore */ +type EventName = 'end' | 'error' | 'readable'; +/** @ignore */ +type Event = [EventName, (_: any) => void, Promise<[EventName, Error | null]>]; +/** @ignore */ +const onEvent = (stream: NodeJS.ReadableStream, event: T) => { + let handler = (_: any) => resolve([event, _]); + let resolve: (value?: [T, any] | PromiseLike<[T, any]>) => void; + return [event, handler, new Promise<[T, any]>( + (r) => (resolve = r) && stream['once'](event, handler) + )] as Event; +}; + +/** @ignore */ +async function* fromNodeStream(stream: NodeJS.ReadableStream): AsyncIterableIterator { + + let events: Event[] = []; + let event: EventName = 'error'; + let done = false, err: Error | null = null; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + let buffers: Uint8Array[] = [], buffer: Uint8Array | Buffer | string; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before we + // add the listener for the source stream's 'readable' event. 
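+    // ('cmd' selects peek vs. read semantics; a non-finite 'size' is treated as a request for everything the stream currently has buffered.)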
+ ({ cmd, size } = yield null); + + // ignore stdin if it's a TTY + if ((stream as any)['isTTY']) { return yield new Uint8Array(0); } + + try { + // initialize the stream event handlers + events[0] = onEvent(stream, 'end'); + events[1] = onEvent(stream, 'error'); + + do { + events[2] = onEvent(stream, 'readable'); + + // wait on the first message event from the stream + [event, err] = await Promise.race(events.map((x) => x[2])); + + // if the stream emitted an Error, rethrow it + if (event === 'error') { break; } + if (!(done = event === 'end')) { + // If the size is NaN, request to read everything in the stream's internal buffer + if (!isFinite(size - bufferLength)) { + buffer = toUint8Array(stream['read'](undefined)); + } else { + buffer = toUint8Array(stream['read'](size - bufferLength)); + // If the byteLength is 0, then the requested amount is more than the stream has + // in its internal buffer. In this case the stream needs a "kick" to tell it to + // continue emitting readable events, so request to read everything the stream + // has in its internal buffer right now. + if (buffer.byteLength < (size - bufferLength)) { + buffer = toUint8Array(stream['read'](undefined)); + } + } + // if chunk is not null or empty, push it onto the queue + if (buffer.byteLength > 0) { + buffers.push(buffer); + bufferLength += buffer.byteLength; + } + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } finally { + await cleanup(events, event === 'error' ? err : null); + } + + function cleanup(events: Event[], err?: T) { + buffer = buffers = null; + return new Promise(async (resolve, reject) => { + for (const [evt, fn] of events) { + stream['off'](evt, fn); + } + try { + // Some stream implementations don't call the destroy callback, + // because it's really a node-internal API. Just calling `destroy` + // here should be enough to conform to the ReadableStream contract + const destroy = (stream as any)['destroy']; + destroy && destroy.call(stream, err); + err = undefined; + } catch (e) { err = e || err; } finally { + err != null ? reject(err) : resolve(); + } + }); + } +} diff --git a/js/src/io/file.ts b/js/src/io/file.ts new file mode 100644 index 0000000000000..d88bc5f6f4e56 --- /dev/null +++ b/js/src/io/file.ts @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
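Before the implementation, a hedged usage sketch of the `RandomAccessFile` defined below; the deep-import path is assumed from the source layout, not taken from the package documentation:

```ts
import { RandomAccessFile } from 'apache-arrow/io/file';

// five bytes: a little-endian int32 (value 1) followed by a single 0xFF byte
const file = new RandomAccessFile(Uint8Array.of(1, 0, 0, 0, 255));

const n = file.readInt32(0); // 1: absolute-position read, cursor unchanged
file.seek(4);                // move the cursor; returns false once at/past EOF
const tail = file.read();    // Uint8Array [255]: the rest, from the cursor on
file.close();                // drops the buffer reference
```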
+ +import { FileHandle } from './interfaces'; +import { ByteStream, AsyncByteStream } from './stream'; +import { ArrayBufferViewInput, toUint8Array } from '../util/buffer'; + +/** @ignore */ +export class RandomAccessFile extends ByteStream { + public size: number; + public position: number = 0; + protected buffer: Uint8Array | null; + constructor(buffer: ArrayBufferViewInput, byteLength?: number) { + super(); + this.buffer = toUint8Array(buffer); + this.size = typeof byteLength === 'undefined' ? this.buffer.byteLength : byteLength; + } + public readInt32(position: number) { + const { buffer, byteOffset } = this.readAt(position, 4); + return new DataView(buffer, byteOffset).getInt32(0, true); + } + public seek(position: number) { + this.position = Math.min(position, this.size); + return position < this.size; + } + public read(nBytes?: number | null) { + const { buffer, size, position } = this; + if (buffer && position < size) { + if (typeof nBytes !== 'number') { nBytes = Infinity; } + this.position = Math.min(size, + position + Math.min(size - position, nBytes)); + return buffer.subarray(position, this.position); + } + return null; + } + public readAt(position: number, nBytes: number) { + const buf = this.buffer; + const end = Math.min(this.size, position + nBytes); + return buf ? buf.subarray(position, end) : new Uint8Array(nBytes); + } + public close() { this.buffer && (this.buffer = null); } + public throw(value?: any) { this.close(); return { done: true, value }; } + public return(value?: any) { this.close(); return { done: true, value }; } +} + +/** @ignore */ +export class AsyncRandomAccessFile extends AsyncByteStream { + // @ts-ignore + public size: number; + public position: number = 0; + public _pending?: Promise; + protected _handle: FileHandle | null; + constructor(file: FileHandle, byteLength?: number) { + super(); + this._handle = file; + if (typeof byteLength === 'number') { + this.size = byteLength; + } else { + this._pending = (async () => { + delete this._pending; + this.size = (await file.stat()).size; + })(); + } + } + public async readInt32(position: number) { + const { buffer, byteOffset } = await this.readAt(position, 4); + return new DataView(buffer, byteOffset).getInt32(0, true); + } + public async seek(position: number) { + this._pending && await this._pending; + this.position = Math.min(position, this.size); + return position < this.size; + } + public async read(nBytes?: number | null) { + this._pending && await this._pending; + const { _handle: file, size, position } = this; + if (file && position < size) { + if (typeof nBytes !== 'number') { nBytes = Infinity; } + let pos = position, offset = 0, bytesRead = 0; + let end = Math.min(size, pos + Math.min(size - pos, nBytes)); + let buffer = new Uint8Array(Math.max(0, (this.position = end) - pos)); + while ((pos += bytesRead) < end && (offset += bytesRead) < buffer.byteLength) { + ({ bytesRead } = await file.read(buffer, offset, buffer.byteLength - offset, pos)); + } + return buffer; + } + return null; + } + public async readAt(position: number, nBytes: number) { + this._pending && await this._pending; + const { _handle: file, size } = this; + if (file && (position + nBytes) < size) { + const end = Math.min(size, position + nBytes); + const buffer = new Uint8Array(end - position); + return (await file.read(buffer, 0, nBytes, position)).buffer; + } + return new Uint8Array(nBytes); + } + public async close() { const f = this._handle; this._handle = null; f && await f.close(); } + public async throw(value?: any) { 
await this.close(); return { done: true, value }; } + public async return(value?: any) { await this.close(); return { done: true, value }; } +} diff --git a/js/src/io/interfaces.ts b/js/src/io/interfaces.ts new file mode 100644 index 0000000000000..9892562e0c0ec --- /dev/null +++ b/js/src/io/interfaces.ts @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import streamAdapters from './adapters'; + +/** @ignore */ +export const ITERATOR_DONE: any = Object.freeze({ done: true, value: void (0) }); + +/** @ignore */ +export type FileHandle = import('fs').promises.FileHandle; +/** @ignore */ +export type ArrowJSONLike = { schema: any; batches?: any[]; dictionaries?: any[]; }; +/** @ignore */ +export type ReadableDOMStreamOptions = { type: 'bytes' | undefined, autoAllocateChunkSize?: number, highWaterMark?: number }; + +/** @ignore */ +export class ArrowJSON { + // @ts-ignore + constructor(private _json: ArrowJSONLike) {} + public get schema(): any { return this._json['schema']; } + public get batches(): any[] { return (this._json['batches'] || []) as any[]; } + public get dictionaries(): any[] { return (this._json['dictionaries'] || []) as any[]; } +} + +/** @ignore */ +export interface Readable { + + readonly closed: Promise; + cancel(reason?: any): Promise; + + read(size?: number | null): Promise; + peek(size?: number | null): Promise; + throw(value?: any): Promise>; + return(value?: any): Promise>; + next(size?: number | null): Promise>; +} + +/** @ignore */ +export interface Writable { + readonly closed: Promise; + close(): void; + write(chunk: T): void; + abort(reason?: any): void; +} + +/** @ignore */ +export interface ReadableWritable extends Readable, Writable { + [Symbol.asyncIterator](): AsyncIterableIterator; + toDOMStream(options?: ReadableDOMStreamOptions): ReadableStream; + toNodeStream(options?: import('stream').ReadableOptions): import('stream').Readable; +} + +/** @ignore */ +export abstract class ReadableInterop { + + public abstract toDOMStream(options?: ReadableDOMStreamOptions): ReadableStream; + public abstract toNodeStream(options?: import('stream').ReadableOptions): import('stream').Readable; + + public tee(): [ReadableStream, ReadableStream] { + return this._getDOMStream().tee(); + } + public pipe(writable: R, options?: { end?: boolean; }) { + return this._getNodeStream().pipe(writable, options); + } + public pipeTo(writable: WritableStream, options?: PipeOptions) { return this._getDOMStream().pipeTo(writable, options); } + public pipeThrough>(duplex: { writable: WritableStream, readable: R }, options?: PipeOptions) { + return this._getDOMStream().pipeThrough(duplex, options); + } + + private _DOMStream?: ReadableStream; + private _getDOMStream() { + return this._DOMStream || (this._DOMStream = 
this.toDOMStream()); + } + + private _nodeStream?: import('stream').Readable; + private _getNodeStream() { + return this._nodeStream || (this._nodeStream = this.toNodeStream()); + } +} + +/** @ignore */ +type Resolution = { resolve: (value?: T | PromiseLike) => void; reject: (reason?: any) => void; }; + +/** @ignore */ +export class AsyncQueue extends ReadableInterop + implements AsyncIterableIterator, ReadableWritable { + + protected _values: TWritable[] = []; + protected _error?: { error: any; }; + protected _closedPromise: Promise; + protected _closedPromiseResolve?: (value?: any) => void; + protected resolvers: Resolution>[] = []; + + constructor() { + super(); + this._closedPromise = new Promise((r) => this._closedPromiseResolve = r); + } + + public get closed(): Promise { return this._closedPromise; } + public async cancel(reason?: any) { await this.return(reason); } + public write(value: TWritable) { + if (this._ensureOpen()) { + this.resolvers.length <= 0 + ? (this._values.push(value)) + : (this.resolvers.shift()!.resolve({ done: false, value } as any)); + } + } + public abort(value?: any) { + if (this._closedPromiseResolve) { + this.resolvers.length <= 0 + ? (this._error = { error: value }) + : (this.resolvers.shift()!.reject({ done: true, value })); + } + } + public close() { + if (this._closedPromiseResolve) { + const { resolvers } = this; + while (resolvers.length > 0) { + resolvers.shift()!.resolve(ITERATOR_DONE); + } + this._closedPromiseResolve(); + this._closedPromiseResolve = undefined; + } + } + + public [Symbol.asyncIterator]() { return this; } + public toDOMStream(options?: ReadableDOMStreamOptions) { + return streamAdapters.toDOMStream( + (this._closedPromiseResolve || this._error) + ? (this as AsyncIterable) + : (this._values as any) as Iterable, + options); + } + public toNodeStream(options?: import('stream').ReadableOptions) { + return streamAdapters.toNodeStream( + (this._closedPromiseResolve || this._error) + ? (this as AsyncIterable) + : (this._values as any) as Iterable, + options); + } + public async throw(_?: any) { await this.abort(_); return ITERATOR_DONE; } + public async return(_?: any) { await this.close(); return ITERATOR_DONE; } + + public async read(size?: number | null): Promise { return (await this.next(size, 'read')).value; } + public async peek(size?: number | null): Promise { return (await this.next(size, 'peek')).value; } + public next(..._args: any[]): Promise> { + if (this._values.length > 0) { + return Promise.resolve({ done: false, value: this._values.shift()! } as any); + } else if (this._error) { + return Promise.reject({ done: true, value: this._error.error }); + } else if (!this._closedPromiseResolve) { + return Promise.resolve(ITERATOR_DONE); + } else { + return new Promise>((resolve, reject) => { + this.resolvers.push({ resolve, reject }); + }); + } + } + + protected _ensureOpen() { + if (this._closedPromiseResolve) { + return true; + } + throw new Error(`${this} is closed`); + } +} diff --git a/js/src/io/stream.ts b/js/src/io/stream.ts new file mode 100644 index 0000000000000..2fe686532a5e5 --- /dev/null +++ b/js/src/io/stream.ts @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import streamAdapters from './adapters'; +import { decodeUtf8 } from '../util/utf8'; +import { ITERATOR_DONE, Readable, Writable, AsyncQueue } from './interfaces'; +import { toUint8Array, joinUint8Arrays, ArrayBufferViewInput } from '../util/buffer'; + +import { + isPromise, isFetchResponse, + isIterable, isAsyncIterable, + isReadableDOMStream, isReadableNodeStream +} from '../util/compat'; + +/** @ignore */ +export type WritableSink = Writable | WritableStream | NodeJS.WritableStream | null; +/** @ignore */ +export type ReadableSource = Readable | PromiseLike | AsyncIterable | ReadableStream | NodeJS.ReadableStream | null; + +/** @ignore */ +export class AsyncByteQueue extends AsyncQueue { + public write(value: ArrayBufferViewInput | Uint8Array) { + if ((value = toUint8Array(value)).byteLength > 0) { + return super.write(value as T); + } + } + public toString(sync: true): string; + public toString(sync?: false): Promise; + public toString(sync = false) { + return sync + ? decodeUtf8(this.toUint8Array(true)) + : this.toUint8Array(false).then(decodeUtf8); + } + public toUint8Array(sync: true): Uint8Array; + public toUint8Array(sync?: false): Promise; + public toUint8Array(sync = false) { + return sync ? joinUint8Arrays(this._values as any[])[0] : (async () => { + let buffers = [], byteLength = 0; + for await (const chunk of this) { + buffers.push(chunk); + byteLength += chunk.byteLength; + } + return joinUint8Arrays(buffers, byteLength)[0]; + })(); + } +} + +/** @ignore */ +export class ByteStream implements IterableIterator { + // @ts-ignore + private source: ByteStreamSource; + constructor(source?: Iterable | ArrayBufferViewInput) { + if (source) { + this.source = new ByteStreamSource(streamAdapters.fromIterable(source)); + } + } + [Symbol.iterator]() { return this; } + public next(value?: any) { return this.source.next(value); } + public throw(value?: any) { return this.source.throw(value); } + public return(value?: any) { return this.source.return(value); } + public peek(size?: number | null) { return this.source.peek(size); } + public read(size?: number | null) { return this.source.read(size); } +} + +/** @ignore */ +export class AsyncByteStream implements Readable, AsyncIterableIterator { + // @ts-ignore + private source: AsyncByteStreamSource; + constructor(source?: PromiseLike | Response | ReadableStream | NodeJS.ReadableStream | AsyncIterable | Iterable) { + if (source instanceof AsyncByteStream) { + this.source = (source as AsyncByteStream).source; + } else if (source instanceof AsyncByteQueue) { + this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source)); + } else if (isReadableNodeStream(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromNodeStream(source)); + } else if (isFetchResponse(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromDOMStream(source.body!)); + } else if (isIterable(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromIterable(source)); + } else if (isPromise(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source)); + 
} else if (isAsyncIterable(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source)); + } else if (isReadableDOMStream(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromDOMStream(source)); + } + } + [Symbol.asyncIterator]() { return this; } + public next(value?: any) { return this.source.next(value); } + public throw(value?: any) { return this.source.throw(value); } + public return(value?: any) { return this.source.return(value); } + public get closed(): Promise { return this.source.closed; } + public cancel(reason?: any) { return this.source.cancel(reason); } + public peek(size?: number | null) { return this.source.peek(size); } + public read(size?: number | null) { return this.source.read(size); } +} + +/** @ignore */ +interface ByteStreamSourceIterator extends IterableIterator { + next(value?: { cmd: 'peek' | 'read', size?: number | null }): IteratorResult; +} + +/** @ignore */ +interface AsyncByteStreamSourceIterator extends AsyncIterableIterator { + next(value?: { cmd: 'peek' | 'read', size?: number | null }): Promise>; +} + +/** @ignore */ +class ByteStreamSource { + constructor(protected source: ByteStreamSourceIterator) {} + public cancel(reason?: any) { this.return(reason); } + public peek(size?: number | null): T | null { return this.next(size, 'peek').value; } + public read(size?: number | null): T | null { return this.next(size, 'read').value; } + public next(size?: number | null, cmd: 'peek' | 'read' = 'read') { return this.source.next({ cmd, size }); } + public throw(value?: any) { return Object.create((this.source.throw && this.source.throw(value)) || ITERATOR_DONE); } + public return(value?: any) { return Object.create((this.source.return && this.source.return(value)) || ITERATOR_DONE); } +} + +/** @ignore */ +class AsyncByteStreamSource implements Readable { + + private _closedPromise: Promise; + private _closedPromiseResolve?: (value?: any) => void; + constructor (protected source: ByteStreamSourceIterator | AsyncByteStreamSourceIterator) { + this._closedPromise = new Promise((r) => this._closedPromiseResolve = r); + } + public async cancel(reason?: any) { await this.return(reason); } + public get closed(): Promise { return this._closedPromise; } + public async read(size?: number | null): Promise { return (await this.next(size, 'read')).value; } + public async peek(size?: number | null): Promise { return (await this.next(size, 'peek')).value; } + public async next(size?: number | null, cmd: 'peek' | 'read' = 'read') { return (await this.source.next({ cmd, size })); } + public async throw(value?: any) { + const result = (this.source.throw && await this.source.throw(value)) || ITERATOR_DONE; + this._closedPromiseResolve && this._closedPromiseResolve(); + this._closedPromiseResolve = undefined; + return Object.create(result); + } + public async return(value?: any) { + const result = (this.source.return && await this.source.return(value)) || ITERATOR_DONE; + this._closedPromiseResolve && this._closedPromiseResolve(); + this._closedPromiseResolve = undefined; + return Object.create(result); + } +} diff --git a/js/src/ipc/magic.ts b/js/src/ipc/magic.ts deleted file mode 100644 index 0688d1a2d1e19..0000000000000 --- a/js/src/ipc/magic.ts +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { flatbuffers } from 'flatbuffers'; -import ByteBuffer = flatbuffers.ByteBuffer; - -export const PADDING = 4; -export const MAGIC_STR = 'ARROW1'; -export const MAGIC = new Uint8Array(MAGIC_STR.length); - -for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { - MAGIC[i] = MAGIC_STR.charCodeAt(i); -} - -export function checkForMagicArrowString(buffer: Uint8Array, index = 0) { - for (let i = -1, n = MAGIC.length; ++i < n;) { - if (MAGIC[i] !== buffer[index + i]) { - return false; - } - } - return true; -} - -export function isValidArrowFile(bb: ByteBuffer) { - let fileLength = bb.capacity(), footerLength: number, lengthOffset: number; - if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || - (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || - (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || - (/* Invalid footer length */ - (footerLength = bb.readInt32(lengthOffset = fileLength - magicAndPadding)) < 1 && - (footerLength + lengthOffset > fileLength))) { - return false; - } - return true; -} - -export const magicLength = MAGIC.length; -export const magicAndPadding = magicLength + PADDING; -export const magicX2AndPadding = magicLength * 2 + PADDING; diff --git a/js/src/ipc/message.ts b/js/src/ipc/message.ts new file mode 100644 index 0000000000000..194e4ac7f679d --- /dev/null +++ b/js/src/ipc/message.ts @@ -0,0 +1,249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
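For context on the readers below: in the IPC format this patch targets, each message on the wire is a 4-byte little-endian metadata length, then the flatbuffer-encoded `Message` metadata, then `bodyLength` bytes of body, which is exactly the sequence `readMetadataLength`, `readMetadata`, and `readMessageBody` walk. A hand-rolled sketch of that framing (`frame` is a hypothetical buffer positioned at the start of one message, not part of this API):

```ts
function splitFrame(frame: Uint8Array) {
    const view = new DataView(frame.buffer, frame.byteOffset, frame.byteLength);
    const metadataLength = view.getInt32(0, true); // the 4-byte PADDING prefix
    // a length <= 0 is treated as end-of-stream (see readMetadataLength below)
    const metadata = frame.subarray(4, 4 + metadataLength);
    // the body follows immediately; its length comes from the decoded
    // flatbuffer metadata (Message.bodyLength), not from the framing itself
    return { metadataLength, metadata };
}
```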
+ +import { MessageHeader } from '../enum'; +import { flatbuffers } from 'flatbuffers'; +import ByteBuffer = flatbuffers.ByteBuffer; +import { Message } from './metadata/message'; +import { isFileHandle } from '../util/compat'; +import { AsyncRandomAccessFile } from '../io/file'; +import { toUint8Array, ArrayBufferViewInput } from '../util/buffer'; +import { ByteStream, ReadableSource, AsyncByteStream } from '../io/stream'; +import { ArrowJSON, ArrowJSONLike, ITERATOR_DONE, FileHandle } from '../io/interfaces'; + +/** @ignore */ const invalidMessageType = (type: MessageHeader) => `Expected ${MessageHeader[type]} Message in stream, but was null or length 0.`; +/** @ignore */ const nullMessage = (type: MessageHeader) => `Header pointer of flatbuffer-encoded ${MessageHeader[type]} Message is null or length 0.`; +/** @ignore */ const invalidMessageMetadata = (expected: number, actual: number) => `Expected to read ${expected} metadata bytes, but only read ${actual}.`; +/** @ignore */ const invalidMessageBodyLength = (expected: number, actual: number) => `Expected to read ${expected} bytes for message body, but only read ${actual}.`; + +/** @ignore */ +export class MessageReader implements IterableIterator { + protected source: ByteStream; + constructor(source: ByteStream | ArrayBufferViewInput | Iterable) { + this.source = source instanceof ByteStream ? source : new ByteStream(source); + } + public [Symbol.iterator](): IterableIterator { return this as IterableIterator; } + public next(): IteratorResult { + let r; + if ((r = this.readMetadataLength()).done) { return ITERATOR_DONE; } + if ((r = this.readMetadata(r.value)).done) { return ITERATOR_DONE; } + return ( r) as IteratorResult; + } + public throw(value?: any) { return this.source.throw(value); } + public return(value?: any) { return this.source.return(value); } + public readMessage(type?: T | null) { + let r: IteratorResult>; + if ((r = this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public readMessageBody(bodyLength: number): Uint8Array { + if (bodyLength <= 0) { return new Uint8Array(0); } + const buf = toUint8Array(this.source.read(bodyLength)); + if (buf.byteLength < bodyLength) { + throw new Error(invalidMessageBodyLength(bodyLength, buf.byteLength)); + } + // 1. Work around bugs in fs.ReadStream's internal Buffer pooling, see: https://github.com/nodejs/node/issues/24817 + // 2. Work around https://github.com/whatwg/streams/blob/0ebe4b042e467d9876d80ae045de3843092ad797/reference-implementation/lib/helpers.js#L126 + return /* 1. */ (buf.byteOffset % 8 === 0) && + /* 2. */ (buf.byteOffset + buf.byteLength) <= buf.buffer.byteLength ? 
buf : buf.slice(); + } + public readSchema(throwIfNull = false) { + const type = MessageHeader.Schema; + const message = this.readMessage(type); + const schema = message && message.header(); + if (throwIfNull && !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } + protected readMetadataLength(): IteratorResult { + const buf = this.source.read(PADDING); + const bb = buf && new ByteBuffer(buf); + const len = +(bb && bb.readInt32(0))!; + return { done: len <= 0, value: len }; + } + protected readMetadata(metadataLength: number): IteratorResult { + const buf = this.source.read(metadataLength); + if (!buf) { return ITERATOR_DONE; } + if (buf.byteLength < metadataLength) { + throw new Error(invalidMessageMetadata(metadataLength, buf.byteLength)); + } + return { done: false, value: Message.decode(buf) }; + } +} + +/** @ignore */ +export class AsyncMessageReader implements AsyncIterableIterator { + protected source: AsyncByteStream; + constructor(source: ReadableSource); + constructor(source: FileHandle, byteLength?: number); + constructor(source: any, byteLength?: number) { + this.source = source instanceof AsyncByteStream ? source + : isFileHandle(source) + ? new AsyncRandomAccessFile(source, byteLength!) + : new AsyncByteStream(source); + } + public [Symbol.asyncIterator](): AsyncIterableIterator { return this as AsyncIterableIterator; } + public async next(): Promise> { + let r; + if ((r = await this.readMetadataLength()).done) { return ITERATOR_DONE; } + if ((r = await this.readMetadata(r.value)).done) { return ITERATOR_DONE; } + return ( r) as IteratorResult; + } + public async throw(value?: any) { return await this.source.throw(value); } + public async return(value?: any) { return await this.source.return(value); } + public async readMessage(type?: T | null) { + let r: IteratorResult>; + if ((r = await this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public async readMessageBody(bodyLength: number): Promise { + if (bodyLength <= 0) { return new Uint8Array(0); } + const buf = toUint8Array(await this.source.read(bodyLength)); + if (buf.byteLength < bodyLength) { + throw new Error(invalidMessageBodyLength(bodyLength, buf.byteLength)); + } + // 1. Work around bugs in fs.ReadStream's internal Buffer pooling, see: https://github.com/nodejs/node/issues/24817 + // 2. Work around https://github.com/whatwg/streams/blob/0ebe4b042e467d9876d80ae045de3843092ad797/reference-implementation/lib/helpers.js#L126 + return /* 1. */ (buf.byteOffset % 8 === 0) && + /* 2. */ (buf.byteOffset + buf.byteLength) <= buf.buffer.byteLength ? 
buf : buf.slice(); + } + public async readSchema(throwIfNull = false) { + const type = MessageHeader.Schema; + const message = await this.readMessage(type); + const schema = message && message.header(); + if (throwIfNull && !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } + protected async readMetadataLength(): Promise> { + const buf = await this.source.read(PADDING); + const bb = buf && new ByteBuffer(buf); + const len = +(bb && bb.readInt32(0))!; + return { done: len <= 0, value: len }; + } + protected async readMetadata(metadataLength: number): Promise> { + const buf = await this.source.read(metadataLength); + if (!buf) { return ITERATOR_DONE; } + if (buf.byteLength < metadataLength) { + throw new Error(invalidMessageMetadata(metadataLength, buf.byteLength)); + } + return { done: false, value: Message.decode(buf) }; + } +} + +/** @ignore */ +export class JSONMessageReader extends MessageReader { + private _schema = false; + private _json: ArrowJSON; + private _body: any[] = []; + private _batchIndex = 0; + private _dictionaryIndex = 0; + constructor(source: ArrowJSON | ArrowJSONLike) { + super(new Uint8Array(0)); + this._json = source instanceof ArrowJSON ? source : new ArrowJSON(source); + } + public next() { + const { _json, _batchIndex, _dictionaryIndex } = this; + const numBatches = _json.batches.length; + const numDictionaries = _json.dictionaries.length; + if (!this._schema) { + this._schema = true; + const message = Message.fromJSON(_json.schema, MessageHeader.Schema); + return { value: message, done: _batchIndex >= numBatches && _dictionaryIndex >= numDictionaries }; + } + if (_dictionaryIndex < numDictionaries) { + const batch = _json.dictionaries[this._dictionaryIndex++]; + this._body = batch['data']['columns']; + const message = Message.fromJSON(batch, MessageHeader.DictionaryBatch); + return { done: false, value: message }; + } + if (_batchIndex < numBatches) { + const batch = _json.batches[this._batchIndex++]; + this._body = batch['columns']; + const message = Message.fromJSON(batch, MessageHeader.RecordBatch); + return { done: false, value: message }; + } + this._body = []; + return ITERATOR_DONE; + } + public readMessageBody(_bodyLength?: number) { + return flattenDataSources(this._body) as any; + function flattenDataSources(xs: any[]): any[][] { + return (xs || []).reduce((buffers, column: any) => [ + ...buffers, + ...(column['VALIDITY'] && [column['VALIDITY']] || []), + ...(column['TYPE'] && [column['TYPE']] || []), + ...(column['OFFSET'] && [column['OFFSET']] || []), + ...(column['DATA'] && [column['DATA']] || []), + ...flattenDataSources(column['children']) + ], [] as any[][]); + } + } + public readMessage(type?: T | null) { + let r: IteratorResult>; + if ((r = this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public readSchema() { + const type = MessageHeader.Schema; + const message = this.readMessage(type); + const schema = message && message.header(); + if (!message || !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } +} + +/** @ignore */ +export const PADDING = 4; +/** @ignore */ +export const MAGIC_STR = 'ARROW1'; +/** @ignore */ +export const MAGIC = new Uint8Array(MAGIC_STR.length); + +for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { + MAGIC[i] = MAGIC_STR.charCodeAt(i); +} + +/** @ignore */ +export function checkForMagicArrowString(buffer: Uint8Array, index = 0) { + for (let i = -1, n = 
MAGIC.length; ++i < n;) { + if (MAGIC[i] !== buffer[index + i]) { + return false; + } + } + return true; +} + +/** @ignore */ +export const magicLength = MAGIC.length; +/** @ignore */ +export const magicAndPadding = magicLength + PADDING; +/** @ignore */ +export const magicX2AndPadding = magicLength * 2 + PADDING; diff --git a/js/src/ipc/metadata.ts b/js/src/ipc/metadata.ts deleted file mode 100644 index 025b051734295..0000000000000 --- a/js/src/ipc/metadata.ts +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/* tslint:disable:class-name */ - -import { Schema, Long, MessageHeader, MetadataVersion } from '../type'; - -export class Footer { - constructor(public dictionaryBatches: FileBlock[], public recordBatches: FileBlock[], public schema: Schema) {} -} - -export class FileBlock { - public offset: number; - public bodyLength: number; - constructor(public metaDataLength: number, bodyLength: Long | number, offset: Long | number) { - this.offset = typeof offset === 'number' ? offset : offset.low; - this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; - } -} - -export class Message { - public bodyLength: number; - public version: MetadataVersion; - public headerType: MessageHeader; - constructor(version: MetadataVersion, bodyLength: Long | number, headerType: MessageHeader) { - this.version = version; - this.headerType = headerType; - this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; - } - static isSchema(m: Message): m is Schema { return m.headerType === MessageHeader.Schema; } - static isRecordBatch(m: Message): m is RecordBatchMetadata { return m.headerType === MessageHeader.RecordBatch; } - static isDictionaryBatch(m: Message): m is DictionaryBatch { return m.headerType === MessageHeader.DictionaryBatch; } -} - -export class RecordBatchMetadata extends Message { - public length: number; - public nodes: FieldMetadata[]; - public buffers: BufferMetadata[]; - constructor(version: MetadataVersion, length: Long | number, nodes: FieldMetadata[], buffers: BufferMetadata[], bodyLength?: Long | number) { - if (bodyLength === void(0)) { - bodyLength = buffers.reduce((bodyLength, buffer) => bodyLength + buffer.length, 0); - } - super(version, bodyLength, MessageHeader.RecordBatch); - this.nodes = nodes; - this.buffers = buffers; - this.length = typeof length === 'number' ? 
length : length.low; - } -} - -export class DictionaryBatch extends Message { - public id: number; - public isDelta: boolean; - public data: RecordBatchMetadata; - constructor(version: MetadataVersion, data: RecordBatchMetadata, id: Long | number, isDelta: boolean = false) { - super(version, data.bodyLength, MessageHeader.DictionaryBatch); - this.isDelta = isDelta; - this.data = data; - this.id = typeof id === 'number' ? id : id.low; - } - private static atomicDictionaryId = 0; - public static getId() { return DictionaryBatch.atomicDictionaryId++; } - public get nodes(): FieldMetadata[] { return this.data.nodes; } - public get buffers(): BufferMetadata[] { return this.data.buffers; } -} - -export class BufferMetadata { - public offset: number; - public length: number; - constructor(offset: Long | number, length: Long | number) { - this.offset = typeof offset === 'number' ? offset : offset.low; - this.length = typeof length === 'number' ? length : length.low; - } -} - -export class FieldMetadata { - public length: number; - public nullCount: number; - constructor(length: Long | number, nullCount: Long | number) { - this.length = typeof length === 'number' ? length : length.low; - this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; - } -} diff --git a/js/src/ipc/metadata/file.ts b/js/src/ipc/metadata/file.ts new file mode 100644 index 0000000000000..d7786fbbf9324 --- /dev/null +++ b/js/src/ipc/metadata/file.ts @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
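For orientation: the `Footer` and `FileBlock` classes below model the random-access file layout, which ends with the footer flatbuffer, a 4-byte little-endian footer length, and the trailing `ARROW1` magic (the same layout the deleted `isValidArrowFile` checked above). A hedged sketch of locating the footer from the end of a complete file buffer, mirroring the `magicLength` and `PADDING` constants from `../message` rather than the library's actual reader:

```ts
function locateFooter(file: Uint8Array): Uint8Array {
    const magicLength = 6; // 'ARROW1'
    const padding = 4;     // the int32 footer-length slot
    const view = new DataView(file.buffer, file.byteOffset, file.byteLength);
    // the footer length sits just before the trailing magic bytes
    const lengthOffset = file.byteLength - magicLength - padding;
    const footerLength = view.getInt32(lengthOffset, true);
    // Footer.decode() would be handed this slice
    return file.subarray(lengthOffset - footerLength, lengthOffset);
}
```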
+ +/* tslint:disable:class-name */ + +import * as File_ from '../../fb/File'; +import { flatbuffers } from 'flatbuffers'; + +import Long = flatbuffers.Long; +import Builder = flatbuffers.Builder; +import ByteBuffer = flatbuffers.ByteBuffer; +import _Block = File_.org.apache.arrow.flatbuf.Block; +import _Footer = File_.org.apache.arrow.flatbuf.Footer; + +import { Schema } from '../../schema'; +import { MetadataVersion } from '../../enum'; +import { toUint8Array } from '../../util/buffer'; +import { ArrayBufferViewInput } from '../../util/buffer'; + +/** @ignore */ +class Footer_ { + + /** @nocollapse */ + public static decode(buf: ArrayBufferViewInput) { + buf = new ByteBuffer(toUint8Array(buf)); + const footer = _Footer.getRootAsFooter(buf); + const schema = Schema.decode(footer.schema()!); + return new OffHeapFooter(schema, footer) as Footer_; + } + + /** @nocollapse */ + public static encode(footer: Footer_) { + + const b: Builder = new Builder(); + const schemaOffset = Schema.encode(b, footer.schema); + + _Footer.startRecordBatchesVector(b, footer.numRecordBatches); + [...footer.recordBatches()].slice().reverse().forEach((rb) => FileBlock.encode(b, rb)); + const recordBatchesOffset = b.endVector(); + + _Footer.startDictionariesVector(b, footer.numDictionaries); + [...footer.dictionaryBatches()].slice().reverse().forEach((db) => FileBlock.encode(b, db)); + + const dictionaryBatchesOffset = b.endVector(); + + _Footer.startFooter(b); + _Footer.addSchema(b, schemaOffset); + _Footer.addVersion(b, MetadataVersion.V4); + _Footer.addRecordBatches(b, recordBatchesOffset); + _Footer.addDictionaries(b, dictionaryBatchesOffset); + _Footer.finishFooterBuffer(b, _Footer.endFooter(b)); + + return b.asUint8Array(); + } + + // @ts-ignore + protected _recordBatches: FileBlock[]; + // @ts-ignore + protected _dictionaryBatches: FileBlock[]; + public get numRecordBatches() { return this._recordBatches.length; } + public get numDictionaries() { return this._dictionaryBatches.length; } + + constructor(public schema: Schema, + public version: MetadataVersion = MetadataVersion.V4, + recordBatches?: FileBlock[], dictionaryBatches?: FileBlock[]) { + recordBatches && (this._recordBatches = recordBatches); + dictionaryBatches && (this._dictionaryBatches = dictionaryBatches); + } + + public *recordBatches(): Iterable { + for (let block, i = -1, n = this.numRecordBatches; ++i < n;) { + if (block = this.getRecordBatch(i)) { yield block; } + } + } + + public *dictionaryBatches(): Iterable { + for (let block, i = -1, n = this.numDictionaries; ++i < n;) { + if (block = this.getDictionaryBatch(i)) { yield block; } + } + } + + public getRecordBatch(index: number) { + return index >= 0 + && index < this.numRecordBatches + && this._recordBatches[index] || null; + } + + public getDictionaryBatch(index: number) { + return index >= 0 + && index < this.numDictionaries + && this._dictionaryBatches[index] || null; + } +} + +export { Footer_ as Footer }; + +/** @ignore */ +class OffHeapFooter extends Footer_ { + + public get numRecordBatches() { return this._footer.recordBatchesLength(); } + public get numDictionaries() { return this._footer.dictionariesLength(); } + + constructor(schema: Schema, protected _footer: _Footer) { + super(schema, _footer.version()); + } + + public getRecordBatch(index: number) { + if (index >= 0 && index < this.numRecordBatches) { + const fileBlock = this._footer.recordBatches(index); + if (fileBlock) { return FileBlock.decode(fileBlock); } + } + return null; + } + + public getDictionaryBatch(index: 
number) { + if (index >= 0 && index < this.numDictionaries) { + const fileBlock = this._footer.dictionaries(index); + if (fileBlock) { return FileBlock.decode(fileBlock); } + } + return null; + } +} + +/** @ignore */ +export class FileBlock { + + /** @nocollapse */ + public static decode(block: _Block) { + return new FileBlock(block.metaDataLength(), block.bodyLength(), block.offset()); + } + + /** @nocollapse */ + public static encode(b: Builder, fileBlock: FileBlock) { + const { metaDataLength } = fileBlock; + const offset = new Long(fileBlock.offset, 0); + const bodyLength = new Long(fileBlock.bodyLength, 0); + return _Block.createBlock(b, offset, metaDataLength, bodyLength); + } + + public offset: number; + public bodyLength: number; + public metaDataLength: number; + + constructor(metaDataLength: number, bodyLength: Long | number, offset: Long | number) { + this.metaDataLength = metaDataLength; + this.offset = typeof offset === 'number' ? offset : offset.low; + this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; + } +} diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts new file mode 100644 index 0000000000000..fa219b3e7853b --- /dev/null +++ b/js/src/ipc/metadata/json.ts @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
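Each FileBlock above records where a record or dictionary batch lives in the file (offset, metadata length, body length), and the footer walks them through a bounds-guarded getter so out-of-range or missing slots yield null rather than throwing. A minimal sketch of that lookup-plus-generator pattern over a toy block list (`ToyFooter` and `Block` are illustrative stand-ins, not the library's API):

// A toy stand-in for FileBlock: where one batch lives in the file.
interface Block { offset: number; metaDataLength: number; bodyLength: number; }

class ToyFooter {
    constructor(private blocks: (Block | null)[]) {}
    public get numRecordBatches() { return this.blocks.length; }
    public getRecordBatch(i: number): Block | null {
        // Same guard as Footer_.getRecordBatch: bounds check, else null.
        return i >= 0 && i < this.numRecordBatches && this.blocks[i] || null;
    }
    public *recordBatches(): IterableIterator<Block> {
        // Mirrors the generator above: skip empty slots, yield the rest.
        for (let block: Block | null, i = -1, n = this.numRecordBatches; ++i < n;) {
            if (block = this.getRecordBatch(i)) { yield block; }
        }
    }
}

const footer = new ToyFooter([{ offset: 8, metaDataLength: 184, bodyLength: 64 }, null]);
for (const b of footer.recordBatches()) { console.log(b.offset); } // prints 8; the null slot is skipped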
+ +import { Schema, Field } from '../../schema'; +import { + DataType, Dictionary, TimeBitWidth, + Utf8, Binary, Decimal, FixedSizeBinary, + List, FixedSizeList, Map_, Struct, Union, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, +} from '../../type'; + +import { DictionaryBatch, RecordBatch, FieldNode, BufferRegion } from './message'; +import { TimeUnit, Precision, IntervalUnit, UnionMode, DateUnit } from '../../enum'; + +/** @ignore */ +export function schemaFromJSON(_schema: any, dictionaries: Map = new Map(), dictionaryFields: Map[]> = new Map()) { + return new Schema( + schemaFieldsFromJSON(_schema, dictionaries, dictionaryFields), + customMetadataFromJSON(_schema['customMetadata']), + dictionaries, dictionaryFields + ); +} + +/** @ignore */ +export function recordBatchFromJSON(b: any) { + return new RecordBatch( + b['count'], + fieldNodesFromJSON(b['columns']), + buffersFromJSON(b['columns']) + ); +} + +/** @ignore */ +export function dictionaryBatchFromJSON(b: any) { + return new DictionaryBatch( + recordBatchFromJSON(b['data']), + b['id'], b['isDelta'] + ); +} + +/** @ignore */ +function schemaFieldsFromJSON(_schema: any, dictionaries?: Map, dictionaryFields?: Map[]>) { + return (_schema['fields'] || []).filter(Boolean).map((f: any) => Field.fromJSON(f, dictionaries, dictionaryFields)); +} + +/** @ignore */ +function fieldChildrenFromJSON(_field: any, dictionaries?: Map, dictionaryFields?: Map[]>): Field[] { + return (_field['children'] || []).filter(Boolean).map((f: any) => Field.fromJSON(f, dictionaries, dictionaryFields)); +} + +/** @ignore */ +function fieldNodesFromJSON(xs: any[]): FieldNode[] { + return (xs || []).reduce((fieldNodes, column: any) => [ + ...fieldNodes, + new FieldNode( + column['count'], + nullCountFromJSON(column['VALIDITY']) + ), + ...fieldNodesFromJSON(column['children']) + ], [] as FieldNode[]); +} + +/** @ignore */ +function buffersFromJSON(xs: any[], buffers: BufferRegion[] = []): BufferRegion[] { + for (let i = -1, n = (xs || []).length; ++i < n;) { + const column = xs[i]; + column['VALIDITY'] && buffers.push(new BufferRegion(buffers.length, column['VALIDITY'].length)); + column['TYPE'] && buffers.push(new BufferRegion(buffers.length, column['TYPE'].length)); + column['OFFSET'] && buffers.push(new BufferRegion(buffers.length, column['OFFSET'].length)); + column['DATA'] && buffers.push(new BufferRegion(buffers.length, column['DATA'].length)); + buffers = buffersFromJSON(column['children'], buffers); + } + return buffers; +} + +/** @ignore */ +function nullCountFromJSON(validity: number[]) { + return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); +} + +/** @ignore */ +export function fieldFromJSON(_field: any, dictionaries?: Map, dictionaryFields?: Map[]>) { + + let id: number; + let keys: TKeys | null; + let field: Field | void; + let dictMeta: any; + let type: DataType; + let dictType: Dictionary; + let dictField: Field; + + // If no dictionary encoding, or in the process of decoding the children of a dictionary-encoded field + if (!dictionaries || !dictionaryFields || !(dictMeta = _field['dictionary'])) { + type = typeFromJSON(_field, fieldChildrenFromJSON(_field, dictionaries, dictionaryFields)); + field = new Field(_field['name'], type, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + } + // tslint:disable + // If dictionary encoded and the first time we've seen this dictionary id, decode + // the data type and child fields, then wrap in a Dictionary type and insert the + 
// data type into the dictionary types map. + else if (!dictionaries.has(id = dictMeta['id'])) { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta['indexType']) ? indexTypeFromJSON(keys) as TKeys : new Int32(); + dictionaries.set(id, type = typeFromJSON(_field, fieldChildrenFromJSON(_field))); + dictType = new Dictionary(type, keys, id, dictMeta['isOrdered']); + dictField = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + dictionaryFields.set(id, [field = dictField]); + } + // If dictionary encoded, and have already seen this dictionary Id in the schema, then reuse the + // data type and wrap in a new Dictionary type and field. + else { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta['indexType']) ? indexTypeFromJSON(keys) as TKeys : new Int32(); + dictType = new Dictionary(dictionaries.get(id)!, keys, id, dictMeta['isOrdered']); + dictField = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + dictionaryFields.get(id)!.push(field = dictField); + } + return field || null; +} + +/** @ignore */ +function customMetadataFromJSON(_metadata?: object) { + return new Map(Object.entries(_metadata || {})); +} + +/** @ignore */ +function indexTypeFromJSON(_type: any) { + return new Int(_type['isSigned'], _type['bitWidth']); +} + +/** @ignore */ +function typeFromJSON(f: any, children?: Field[]): DataType { + + const typeId = f['type']['name']; + + switch (typeId) { + case 'NONE': return new DataType(); + case 'null': return new Null(); + case 'binary': return new Binary(); + case 'utf8': return new Utf8(); + case 'bool': return new Bool(); + case 'list': return new List((children || [])[0]); + case 'struct': return new Struct(children || []); + case 'struct_': return new Struct(children || []); + } + + switch (typeId) { + case 'int': { + const t = f['type']; + return new Int(t['isSigned'], t['bitWidth'] as IntBitWidth); + } + case 'floatingpoint': { + const t = f['type']; + return new Float(Precision[t['precision']] as any); + } + case 'decimal': { + const t = f['type']; + return new Decimal(t['scale'], t['precision']); + } + case 'date': { + const t = f['type']; + return new Date_(DateUnit[t['unit']] as any); + } + case 'time': { + const t = f['type']; + return new Time(TimeUnit[t['unit']] as any, t['bitWidth'] as TimeBitWidth); + } + case 'timestamp': { + const t = f['type']; + return new Timestamp(TimeUnit[t['unit']] as any, t['timezone']); + } + case 'interval': { + const t = f['type']; + return new Interval(IntervalUnit[t['unit']] as any); + } + case 'union': { + const t = f['type']; + return new Union(UnionMode[t['mode']] as any, (t['typeIds'] || []), children || []); + } + case 'fixedsizebinary': { + const t = f['type']; + return new FixedSizeBinary(t['byteWidth']); + } + case 'fixedsizelist': { + const t = f['type']; + return new FixedSizeList(t['listSize'], (children || [])[0]); + } + case 'map': { + const t = f['type']; + return new Map_(children || [], t['keysSorted']); + } + } + throw new Error(`Unrecognized type: "${typeId}"`); +} diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts new file mode 100644 index 0000000000000..d1ab219cd943f --- /dev/null +++ b/js/src/ipc/metadata/message.ts @@ -0,0 +1,595 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { flatbuffers } from 'flatbuffers'; +import * as Schema_ from '../../fb/Schema'; +import * as Message_ from '../../fb/Message'; + +import { Schema, Field } from '../../schema'; +import { toUint8Array } from '../../util/buffer'; +import { ArrayBufferViewInput } from '../../util/buffer'; +import { MessageHeader, MetadataVersion } from '../../enum'; +import { instance as typeAssembler } from '../../visitor/typeassembler'; +import { fieldFromJSON, schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from './json'; + +import Long = flatbuffers.Long; +import Builder = flatbuffers.Builder; +import ByteBuffer = flatbuffers.ByteBuffer; +import _Int = Schema_.org.apache.arrow.flatbuf.Int; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import _Field = Schema_.org.apache.arrow.flatbuf.Field; +import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; +import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; +import _Message = Message_.org.apache.arrow.flatbuf.Message; +import _KeyValue = Schema_.org.apache.arrow.flatbuf.KeyValue; +import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; +import _Endianness = Schema_.org.apache.arrow.flatbuf.Endianness; +import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; +import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; +import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; + +import { + DataType, Dictionary, TimeBitWidth, + Utf8, Binary, Decimal, FixedSizeBinary, + List, FixedSizeList, Map_, Struct, Union, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, +} from '../../type'; + +/** @ignore */ +export class Message { + + /** @nocollapse */ + public static fromJSON(msg: any, headerType: T): Message { + const message = new Message(0, MetadataVersion.V4, headerType); + message._createHeader = messageHeaderFromJSON(msg, headerType); + return message; + } + + /** @nocollapse */ + public static decode(buf: ArrayBufferViewInput) { + buf = new ByteBuffer(toUint8Array(buf)); + const _message = _Message.getRootAsMessage(buf); + const bodyLength: Long = _message.bodyLength()!; + const version: MetadataVersion = _message.version(); + const headerType: MessageHeader = _message.headerType(); + const message = new Message(bodyLength, version, headerType); + message._createHeader = decodeMessageHeader(_message, headerType); + return message; + } + + /** @nocollapse */ + public static encode(message: Message) { + let b = new Builder(), headerOffset = -1; + if (message.isSchema()) { + headerOffset = Schema.encode(b, message.header() as Schema); + } else if (message.isRecordBatch()) { + headerOffset = RecordBatch.encode(b, message.header() as RecordBatch); + } else if (message.isDictionaryBatch()) { + headerOffset = DictionaryBatch.encode(b, 
message.header() as DictionaryBatch); + } + _Message.startMessage(b); + _Message.addVersion(b, MetadataVersion.V4); + _Message.addHeader(b, headerOffset); + _Message.addHeaderType(b, message.headerType); + _Message.addBodyLength(b, new Long(message.bodyLength, 0)); + _Message.finishMessageBuffer(b, _Message.endMessage(b)); + return b.asUint8Array(); + } + + /** @nocollapse */ + public static from(header: Schema | RecordBatch | DictionaryBatch, bodyLength = 0) { + if (header instanceof Schema) { + return new Message(0, MetadataVersion.V4, MessageHeader.Schema, header); + } + if (header instanceof RecordBatch) { + return new Message(bodyLength, MetadataVersion.V4, MessageHeader.RecordBatch, header); + } + if (header instanceof DictionaryBatch) { + return new Message(bodyLength, MetadataVersion.V4, MessageHeader.DictionaryBatch, header); + } + throw new Error(`Unrecognized Message header: ${header}`); + } + + // @ts-ignore + public body: Uint8Array; + protected _headerType: T; + protected _bodyLength: number; + protected _version: MetadataVersion; + public get type() { return this.headerType; } + public get version() { return this._version; } + public get headerType() { return this._headerType; } + public get bodyLength() { return this._bodyLength; } + // @ts-ignore + protected _createHeader: MessageHeaderDecoder; + public header() { return this._createHeader(); } + public isSchema(): this is Message { return this.headerType === MessageHeader.Schema; } + public isRecordBatch(): this is Message { return this.headerType === MessageHeader.RecordBatch; } + public isDictionaryBatch(): this is Message { return this.headerType === MessageHeader.DictionaryBatch; } + + constructor(bodyLength: Long | number, version: MetadataVersion, headerType: T, header?: any) { + this._version = version; + this._headerType = headerType; + this.body = new Uint8Array(0); + header && (this._createHeader = () => header); + this._bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; + } +} + +/** @ignore */ +export class RecordBatch { + protected _length: number; + protected _nodes: FieldNode[]; + protected _buffers: BufferRegion[]; + public get nodes() { return this._nodes; } + public get length() { return this._length; } + public get buffers() { return this._buffers; } + constructor(length: Long | number, nodes: FieldNode[], buffers: BufferRegion[]) { + this._nodes = nodes; + this._buffers = buffers; + this._length = typeof length === 'number' ? length : length.low; + } +} + +/** @ignore */ +export class DictionaryBatch { + + protected _id: number; + protected _isDelta: boolean; + protected _data: RecordBatch; + public get id() { return this._id; } + public get data() { return this._data; } + public get isDelta() { return this._isDelta; } + public get length(): number { return this.data.length; } + public get nodes(): FieldNode[] { return this.data.nodes; } + public get buffers(): BufferRegion[] { return this.data.buffers; } + + constructor(data: RecordBatch, id: Long | number, isDelta: boolean = false) { + this._data = data; + this._isDelta = isDelta; + this._id = typeof id === 'number' ? id : id.low; + } +} + +/** @ignore */ +export class BufferRegion { + public offset: number; + public length: number; + constructor(offset: Long | number, length: Long | number) { + this.offset = typeof offset === 'number' ? offset : offset.low; + this.length = typeof length === 'number' ? 
length : length.low; + } +} + +/** @ignore */ +export class FieldNode { + public length: number; + public nullCount: number; + constructor(length: Long | number, nullCount: Long | number) { + this.length = typeof length === 'number' ? length : length.low; + this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; + } +} + +/** @ignore */ +function messageHeaderFromJSON(message: any, type: MessageHeader) { + return (() => { + switch (type) { + case MessageHeader.Schema: return Schema.fromJSON(message); + case MessageHeader.RecordBatch: return RecordBatch.fromJSON(message); + case MessageHeader.DictionaryBatch: return DictionaryBatch.fromJSON(message); + } + throw new Error(`Unrecognized Message type: { name: ${MessageHeader[type]}, type: ${type} }`); + }) as MessageHeaderDecoder; +} + +/** @ignore */ +function decodeMessageHeader(message: _Message, type: MessageHeader) { + return (() => { + switch (type) { + case MessageHeader.Schema: return Schema.decode(message.header(new _Schema())!); + case MessageHeader.RecordBatch: return RecordBatch.decode(message.header(new _RecordBatch())!, message.version()); + case MessageHeader.DictionaryBatch: return DictionaryBatch.decode(message.header(new _DictionaryBatch())!, message.version()); + } + throw new Error(`Unrecognized Message type: { name: ${MessageHeader[type]}, type: ${type} }`); + }) as MessageHeaderDecoder; +} + +Field['encode'] = encodeField; +Field['decode'] = decodeField; +Field['fromJSON'] = fieldFromJSON; + +Schema['encode'] = encodeSchema; +Schema['decode'] = decodeSchema; +Schema['fromJSON'] = schemaFromJSON; + +RecordBatch['encode'] = encodeRecordBatch; +RecordBatch['decode'] = decodeRecordBatch; +RecordBatch['fromJSON'] = recordBatchFromJSON; + +DictionaryBatch['encode'] = encodeDictionaryBatch; +DictionaryBatch['decode'] = decodeDictionaryBatch; +DictionaryBatch['fromJSON'] = dictionaryBatchFromJSON; + +FieldNode['encode'] = encodeFieldNode; +FieldNode['decode'] = decodeFieldNode; + +BufferRegion['encode'] = encodeBufferRegion; +BufferRegion['decode'] = decodeBufferRegion; + +declare module '../../schema' { + namespace Field { + export { encodeField as encode }; + export { decodeField as decode }; + export { fieldFromJSON as fromJSON }; + } + namespace Schema { + export { encodeSchema as encode }; + export { decodeSchema as decode }; + export { schemaFromJSON as fromJSON }; + } +} + +declare module './message' { + namespace RecordBatch { + export { encodeRecordBatch as encode }; + export { decodeRecordBatch as decode }; + export { recordBatchFromJSON as fromJSON }; + } + namespace DictionaryBatch { + export { encodeDictionaryBatch as encode }; + export { decodeDictionaryBatch as decode }; + export { dictionaryBatchFromJSON as fromJSON }; + } + namespace FieldNode { + export { encodeFieldNode as encode }; + export { decodeFieldNode as decode }; + } + namespace BufferRegion { + export { encodeBufferRegion as encode }; + export { decodeBufferRegion as decode }; + } +} + +/** @ignore */ +function decodeSchema(_schema: _Schema, dictionaries: Map = new Map(), dictionaryFields: Map[]> = new Map()) { + const fields = decodeSchemaFields(_schema, dictionaries, dictionaryFields); + return new Schema(fields, decodeCustomMetadata(_schema), dictionaries, dictionaryFields); +} + +/** @ignore */ +function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V4) { + return new RecordBatch(batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version)); +} + +/** @ignore */ +function 
decodeDictionaryBatch(batch: _DictionaryBatch, version = MetadataVersion.V4) { + return new DictionaryBatch(RecordBatch.decode(batch.data()!, version), batch.id(), batch.isDelta()); +} + +/** @ignore */ +function decodeBufferRegion(b: _Buffer) { + return new BufferRegion(b.offset(), b.length()); +} + +/** @ignore */ +function decodeFieldNode(f: _FieldNode) { + return new FieldNode(f.length(), f.nullCount()); +} + +/** @ignore */ +function decodeFieldNodes(batch: _RecordBatch) { + const nodes = [] as FieldNode[]; + for (let f, i = -1, j = -1, n = batch.nodesLength(); ++i < n;) { + if (f = batch.nodes(i)) { + nodes[++j] = FieldNode.decode(f); + } + } + return nodes; +} + +/** @ignore */ +function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) { + const bufferRegions = [] as BufferRegion[]; + for (let b, i = -1, j = -1, n = batch.buffersLength(); ++i < n;) { + if (b = batch.buffers(i)) { + // If this Arrow buffer was written before version 4, + // advance the buffer's bb_pos 8 bytes to skip past + // the now-removed page_id field + if (version < MetadataVersion.V4) { + b.bb_pos += (8 * (i + 1)); + } + bufferRegions[++j] = BufferRegion.decode(b); + } + } + return bufferRegions; +} + +/** @ignore */ +function decodeSchemaFields(schema: _Schema, dictionaries?: Map, dictionaryFields?: Map[]>) { + const fields = [] as Field[]; + for (let f, i = -1, j = -1, n = schema.fieldsLength(); ++i < n;) { + if (f = schema.fields(i)) { + fields[++j] = Field.decode(f, dictionaries, dictionaryFields); + } + } + return fields; +} + +/** @ignore */ +function decodeFieldChildren(field: _Field, dictionaries?: Map, dictionaryFields?: Map[]>): Field[] { + const children = [] as Field[]; + for (let f, i = -1, j = -1, n = field.childrenLength(); ++i < n;) { + if (f = field.children(i)) { + children[++j] = Field.decode(f, dictionaries, dictionaryFields); + } + } + return children; +} + +/** @ignore */ +function decodeField(f: _Field, dictionaries?: Map, dictionaryFields?: Map[]>) { + + let id: number; + let field: Field | void; + let type: DataType; + let keys: _Int | TKeys | null; + let dictType: Dictionary; + let dictMeta: _DictionaryEncoding | null; + let dictField: Field; + + // If no dictionary encoding, or in the process of decoding the children of a dictionary-encoded field + if (!dictionaries || !dictionaryFields || !(dictMeta = f.dictionary())) { + type = decodeFieldType(f, decodeFieldChildren(f, dictionaries, dictionaryFields)); + field = new Field(f.name()!, type, f.nullable(), decodeCustomMetadata(f)); + } + // tslint:disable + // If dictionary encoded and the first time we've seen this dictionary id, decode + // the data type and child fields, then wrap in a Dictionary type and insert the + // data type into the dictionary types map. + else if (!dictionaries.has(id = dictMeta.id().low)) { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta.indexType()) ? decodeIndexType(keys) as TKeys : new Int32(); + dictionaries.set(id, type = decodeFieldType(f, decodeFieldChildren(f))); + dictType = new Dictionary(type, keys, id, dictMeta.isOrdered()); + dictField = new Field(f.name()!, dictType, f.nullable(), decodeCustomMetadata(f)); + dictionaryFields.set(id, [field = dictField]); + } + // If dictionary encoded, and have already seen this dictionary Id in the schema, then reuse the + // data type and wrap in a new Dictionary type and field. + else { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta.indexType()) ? 
decodeIndexType(keys) as TKeys : new Int32(); + dictType = new Dictionary(dictionaries.get(id)!, keys, id, dictMeta.isOrdered()); + dictField = new Field(f.name()!, dictType, f.nullable(), decodeCustomMetadata(f)); + dictionaryFields.get(id)!.push(field = dictField); + } + return field || null; +} + +/** @ignore */ +function decodeCustomMetadata(parent?: _Schema | _Field | null) { + const data = new Map(); + if (parent) { + for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { + if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { + data.set(key, entry.value()!); + } + } + } + return data; +} + +/** @ignore */ +function decodeIndexType(_type: _Int) { + return new Int(_type.isSigned(), _type.bitWidth() as IntBitWidth); +} + +/** @ignore */ +function decodeFieldType(f: _Field, children?: Field[]): DataType { + + const typeId = f.typeType(); + + switch (typeId) { + case Type.NONE: return new DataType(); + case Type.Null: return new Null(); + case Type.Binary: return new Binary(); + case Type.Utf8: return new Utf8(); + case Type.Bool: return new Bool(); + case Type.List: return new List((children || [])[0]); + case Type.Struct_: return new Struct(children || []); + } + + switch (typeId) { + case Type.Int: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Int())!; + return new Int(t.isSigned(), t.bitWidth()); + } + case Type.FloatingPoint: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.FloatingPoint())!; + return new Float(t.precision()); + } + case Type.Decimal: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Decimal())!; + return new Decimal(t.scale(), t.precision()); + } + case Type.Date: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Date())!; + return new Date_(t.unit()); + } + case Type.Time: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Time())!; + return new Time(t.unit(), t.bitWidth() as TimeBitWidth); + } + case Type.Timestamp: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Timestamp())!; + return new Timestamp(t.unit(), t.timezone()); + } + case Type.Interval: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Interval())!; + return new Interval(t.unit()); + } + case Type.Union: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Union())!; + return new Union(t.mode(), t.typeIdsArray() || [], children || []); + } + case Type.FixedSizeBinary: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.FixedSizeBinary())!; + return new FixedSizeBinary(t.byteWidth()); + } + case Type.FixedSizeList: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.FixedSizeList())!; + return new FixedSizeList(t.listSize(), (children || [])[0]); + } + case Type.Map: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Map())!; + return new Map_(children || [], t.keysSorted()); + } + } + throw new Error(`Unrecognized type: "${Type[typeId]}" (${typeId})`); +} + +/** @ignore */ +function encodeSchema(b: Builder, schema: Schema) { + + const fieldOffsets = schema.fields.map((f) => Field.encode(b, f)); + + _Schema.startFieldsVector(b, fieldOffsets.length); + + const fieldsVectorOffset = _Schema.createFieldsVector(b, fieldOffsets); + + const metadataOffset = !(schema.metadata && schema.metadata.size > 0) ? 
-1 : + _Schema.createCustomMetadataVector(b, [...schema.metadata].map(([k, v]) => { + const key = b.createString(`${k}`); + const val = b.createString(`${v}`); + _KeyValue.startKeyValue(b); + _KeyValue.addKey(b, key); + _KeyValue.addValue(b, val); + return _KeyValue.endKeyValue(b); + })); + + _Schema.startSchema(b); + _Schema.addFields(b, fieldsVectorOffset); + _Schema.addEndianness(b, platformIsLittleEndian ? _Endianness.Little : _Endianness.Big); + + if (metadataOffset !== -1) { _Schema.addCustomMetadata(b, metadataOffset); } + + return _Schema.endSchema(b); +} + +/** @ignore */ +function encodeField(b: Builder, field: Field) { + + let nameOffset = -1; + let typeOffset = -1; + let dictionaryOffset = -1; + + let type = field.type; + let typeId: Type = field.typeId; + + if (!DataType.isDictionary(type)) { + typeOffset = typeAssembler.visit(type, b)!; + } else { + typeId = type.dictionary.typeId; + dictionaryOffset = typeAssembler.visit(type, b)!; + typeOffset = typeAssembler.visit(type.dictionary, b)!; + } + + const childOffsets = (type.children || []).map((f: Field) => Field.encode(b, f)); + const childrenVectorOffset = _Field.createChildrenVector(b, childOffsets); + + const metadataOffset = !(field.metadata && field.metadata.size > 0) ? -1 : + _Field.createCustomMetadataVector(b, [...field.metadata].map(([k, v]) => { + const key = b.createString(`${k}`); + const val = b.createString(`${v}`); + _KeyValue.startKeyValue(b); + _KeyValue.addKey(b, key); + _KeyValue.addValue(b, val); + return _KeyValue.endKeyValue(b); + })); + + if (field.name) { + nameOffset = b.createString(field.name); + } + + _Field.startField(b); + _Field.addType(b, typeOffset); + _Field.addTypeType(b, typeId); + _Field.addChildren(b, childrenVectorOffset); + _Field.addNullable(b, !!field.nullable); + + if (nameOffset !== -1) { _Field.addName(b, nameOffset); } + if (dictionaryOffset !== -1) { _Field.addDictionary(b, dictionaryOffset); } + if (metadataOffset !== -1) { _Field.addCustomMetadata(b, metadataOffset); } + + return _Field.endField(b); +} + +/** @ignore */ +function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { + + const nodes = recordBatch.nodes || []; + const buffers = recordBatch.buffers || []; + + _RecordBatch.startNodesVector(b, nodes.length); + nodes.slice().reverse().forEach((n) => FieldNode.encode(b, n)); + + const nodesVectorOffset = b.endVector(); + + _RecordBatch.startBuffersVector(b, buffers.length); + buffers.slice().reverse().forEach((b_) => BufferRegion.encode(b, b_)); + + const buffersVectorOffset = b.endVector(); + + _RecordBatch.startRecordBatch(b); + _RecordBatch.addLength(b, new Long(recordBatch.length, 0)); + _RecordBatch.addNodes(b, nodesVectorOffset); + _RecordBatch.addBuffers(b, buffersVectorOffset); + return _RecordBatch.endRecordBatch(b); +} + +/** @ignore */ +function encodeDictionaryBatch(b: Builder, dictionaryBatch: DictionaryBatch) { + const dataOffset = RecordBatch.encode(b, dictionaryBatch.data); + _DictionaryBatch.startDictionaryBatch(b); + _DictionaryBatch.addId(b, new Long(dictionaryBatch.id, 0)); + _DictionaryBatch.addIsDelta(b, dictionaryBatch.isDelta); + _DictionaryBatch.addData(b, dataOffset); + return _DictionaryBatch.endDictionaryBatch(b); +} + +/** @ignore */ +function encodeFieldNode(b: Builder, node: FieldNode) { + return _FieldNode.createFieldNode(b, new Long(node.length, 0), new Long(node.nullCount, 0)); +} + +/** @ignore */ +function encodeBufferRegion(b: Builder, node: BufferRegion) { + return _Buffer.createBuffer(b, new Long(node.offset, 0), new 
Long(node.length, 0)); +} + +/** @ignore */ +const platformIsLittleEndian = (function() { + const buffer = new ArrayBuffer(2); + new DataView(buffer).setInt16(0, 256, true /* littleEndian */); + // Int16Array uses the platform's endianness. + return new Int16Array(buffer)[0] === 256; +})(); + +/** @ignore */ +type MessageHeaderDecoder = () => T extends MessageHeader.Schema ? Schema + : T extends MessageHeader.RecordBatch ? RecordBatch + : T extends MessageHeader.DictionaryBatch ? DictionaryBatch : never; diff --git a/js/src/ipc/node/iterable.ts b/js/src/ipc/node/iterable.ts new file mode 100644 index 0000000000000..eb5542a1c542d --- /dev/null +++ b/js/src/ipc/node/iterable.ts @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Readable } from 'stream'; +import { isIterable, isAsyncIterable } from '../../util/compat'; + +/** @ignore */ +type ReadableOptions = import('stream').ReadableOptions; + +/** @ignore */ +export function toNodeStream(source: Iterable | AsyncIterable, options?: ReadableOptions): Readable { + if (isAsyncIterable(source)) { return new AsyncIterableReadable(source[Symbol.asyncIterator](), options); } + if (isIterable(source)) { return new IterableReadable(source[Symbol.iterator](), options); } + /* istanbul ignore next */ + throw new Error(`toNodeStream() must be called with an Iterable or AsyncIterable`); +} + +/** @ignore */ +class IterableReadable extends Readable { + private _pulling: boolean; + private _bytesMode: boolean; + private _iterator: Iterator; + constructor(it: Iterator, options?: ReadableOptions) { + super(options); + this._iterator = it; + this._pulling = false; + this._bytesMode = !options || !options.objectMode; + } + _read(size: number) { + const it = this._iterator; + if (it && !this._pulling && (this._pulling = true)) { + this._pulling = this._pull(size, it); + } + } + _destroy(e: Error | null, cb: (e: Error | null) => void) { + let it = this._iterator, fn: any; + it && (fn = e != null && it.throw || it.return); + fn && fn.call(it, e); + cb && cb(null); + } + private _pull(size: number, it: Iterator) { + const bm = this._bytesMode; + let r: IteratorResult | null = null; + while (this.readable && !(r = it.next(bm ? size : null)).done) { + if (size != null) { + size -= (bm && ArrayBuffer.isView(r.value) ? 
r.value.byteLength : 1); + } + if (!this.push(r.value) || size <= 0) { break; } + } + if ((r && r.done || !this.readable) && (this.push(null) || true)) { + it.return && it.return(); + } + return !this.readable; + } +} + +/** @ignore */ +class AsyncIterableReadable extends Readable { + private _pulling: boolean; + private _bytesMode: boolean; + private _iterator: AsyncIterator; + constructor(it: AsyncIterator, options?: ReadableOptions) { + super(options); + this._iterator = it; + this._pulling = false; + this._bytesMode = !options || !options.objectMode; + } + _read(size: number) { + const it = this._iterator; + if (it && !this._pulling && (this._pulling = true)) { + (async () => this._pulling = await this._pull(size, it))(); + } + } + _destroy(e: Error | null, cb: (e: Error | null) => void) { + let it = this._iterator, fn: any; + it && (fn = e != null && it.throw || it.return); + fn && fn.call(it, e).then(() => cb && cb(null)) || (cb && cb(null)); + } + private async _pull(size: number, it: AsyncIterator) { + const bm = this._bytesMode; + let r: IteratorResult | null = null; + while (this.readable && !(r = await it.next(bm ? size : null)).done) { + if (size != null) { + size -= (bm && ArrayBuffer.isView(r.value) ? r.value.byteLength : 1); + } + if (!this.push(r.value) || size <= 0) { break; } + } + if ((r && r.done || !this.readable) && (this.push(null) || true)) { + it.return && it.return(); + } + return !this.readable; + } +} diff --git a/js/src/ipc/node/reader.ts b/js/src/ipc/node/reader.ts new file mode 100644 index 0000000000000..ca19eecb58f40 --- /dev/null +++ b/js/src/ipc/node/reader.ts @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Duplex, DuplexOptions } from 'stream'; +import { DataType } from '../../type'; +import { RecordBatch } from '../../recordbatch'; +import { AsyncByteQueue } from '../../io/stream'; +import { RecordBatchReader } from '../../ipc/reader'; + +/** @ignore */ +export function recordBatchReaderThroughNodeStream(options?: DuplexOptions & { autoDestroy: boolean }) { + return new RecordBatchReaderDuplex(options); +} + +/** @ignore */ +type CB = (error?: Error | null | undefined) => void; + +/** @ignore */ +class RecordBatchReaderDuplex extends Duplex { + private _pulling: boolean = false; + private _autoDestroy: boolean = true; + private _reader: RecordBatchReader | null; + private _asyncQueue: AsyncByteQueue | null; + constructor(options?: DuplexOptions & { autoDestroy: boolean }) { + super({ allowHalfOpen: false, ...options, readableObjectMode: true, writableObjectMode: false }); + this._reader = null; + this._pulling = false; + this._asyncQueue = new AsyncByteQueue(); + this._autoDestroy = options && (typeof options.autoDestroy === 'boolean') ? 
options.autoDestroy : true; + } + _final(cb?: CB) { + const aq = this._asyncQueue; + aq && aq.close(); + cb && cb(); + } + _write(x: any, _: string, cb: CB) { + const aq = this._asyncQueue; + aq && aq.write(x); + cb && cb(); + return true; + } + _read(size: number) { + const aq = this._asyncQueue; + if (aq && !this._pulling && (this._pulling = true)) { + (async () => { + if (!this._reader) { + this._reader = await this._open(aq); + } + this._pulling = await this._pull(size, this._reader); + })(); + } + } + _destroy(err: Error | null, cb: (error: Error | null) => void) { + const aq = this._asyncQueue; + if (aq) { err ? aq.abort(err) : aq.close(); } + cb(this._asyncQueue = this._reader = null); + } + async _open(source: AsyncByteQueue) { + return await (await RecordBatchReader.from(source)).open({ autoDestroy: this._autoDestroy }); + } + async _pull(size: number, reader: RecordBatchReader) { + let r: IteratorResult> | null = null; + while (this.readable && !(r = await reader.next()).done) { + if (!this.push(r.value) || (size != null && --size <= 0)) { break; } + } + if ((r && r.done || !this.readable)) { + this.push(null); + await reader.cancel(); + } + return !this.readable; + } +} diff --git a/js/src/ipc/node/writer.ts b/js/src/ipc/node/writer.ts new file mode 100644 index 0000000000000..a1b31efa990c3 --- /dev/null +++ b/js/src/ipc/node/writer.ts @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
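RecordBatchReaderDuplex above buffers written bytes in an AsyncByteQueue, lazily opens a RecordBatchReader over it, and pushes decoded batches out the readable side behind the usual `_pulling` re-entrancy guard. A hedged usage sketch, assuming the Node bindings expose this transform as the `RecordBatchReader.throughNode()` static (the stub for which appears later in this diff, in reader.ts) and that `table.arrow` holds Arrow IPC stream bytes:

import * as fs from 'fs';
import { RecordBatchReader } from 'apache-arrow';

// Raw IPC bytes in, RecordBatch objects out (readableObjectMode: true).
fs.createReadStream('table.arrow')
    .pipe(RecordBatchReader.throughNode())
    .on('data', (batch) => console.log(`batch with ${batch.length} rows`))
    .on('end', () => console.log('done'));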
+ +import { Duplex, DuplexOptions } from 'stream'; +import { DataType } from '../../type'; +import { AsyncByteStream } from '../../io/stream'; +import { RecordBatchWriter } from '../../ipc/writer'; + +/** @ignore */ +export function recordBatchWriterThroughNodeStream(this: typeof RecordBatchWriter, options?: DuplexOptions & { autoDestroy: boolean }) { + return new RecordBatchWriterDuplex(new this(options)); +} + +/** @ignore */ +type CB = (error?: Error | null | undefined) => void; + +/** @ignore */ +class RecordBatchWriterDuplex extends Duplex { + private _pulling: boolean = false; + private _reader: AsyncByteStream | null; + private _writer: RecordBatchWriter | null; + constructor(writer: RecordBatchWriter, options?: DuplexOptions) { + super({ allowHalfOpen: false, ...options, writableObjectMode: true, readableObjectMode: false }); + this._writer = writer; + this._reader = new AsyncByteStream(writer); + } + _final(cb?: CB) { + const writer = this._writer; + writer && writer.close(); + cb && cb(); + } + _write(x: any, _: string, cb: CB) { + const writer = this._writer; + writer && writer.write(x); + cb && cb(); + return true; + } + _read(size: number) { + const it = this._reader; + if (it && !this._pulling && (this._pulling = true)) { + (async () => this._pulling = await this._pull(size, it))(); + } + } + _destroy(err: Error | null, cb: (error: Error | null) => void) { + const writer = this._writer; + if (writer) { err ? writer.abort(err) : writer.close(); } + cb(this._reader = this._writer = null); + } + async _pull(size: number, reader: AsyncByteStream) { + let r: IteratorResult | null = null; + while (this.readable && !(r = await reader.next(size || null)).done) { + if (size != null && r.value) { + size -= r.value.byteLength; + } + if (!this.push(r.value) || size <= 0) { break; } + } + if ((r && r.done || !this.readable)) { + this.push(null); + await reader.cancel(); + } + return !this.readable; + } +} diff --git a/js/src/ipc/reader.ts b/js/src/ipc/reader.ts new file mode 100644 index 0000000000000..91990afb35b17 --- /dev/null +++ b/js/src/ipc/reader.ts @@ -0,0 +1,737 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
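recordBatchWriterThroughNodeStream is the mirror image: RecordBatch objects are written into the duplex and serialized IPC bytes come out the readable side. The `this: typeof RecordBatchWriter` parameter exists so the factory can be attached as a static on each writer class; assuming it surfaces as `RecordBatchStreamWriter.throughNode()` in the Node bindings, a round-trip sketch:

import * as fs from 'fs';
import { RecordBatchReader, RecordBatchStreamWriter } from 'apache-arrow';

// Decode batches from one IPC file and immediately re-encode them into another.
fs.createReadStream('in.arrow')
    .pipe(RecordBatchReader.throughNode())       // bytes -> RecordBatch
    .pipe(RecordBatchStreamWriter.throughNode()) // RecordBatch -> bytes
    .pipe(fs.createWriteStream('out.arrow'));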
+ +import { DataType } from '../type'; +import { Vector } from '../vector'; +import { MessageHeader } from '../enum'; +import { Footer } from './metadata/file'; +import { Schema, Field } from '../schema'; +import streamAdapters from '../io/adapters'; +import { Message } from './metadata/message'; +import { RecordBatch } from '../recordbatch'; +import * as metadata from './metadata/message'; +import { ArrayBufferViewInput } from '../util/buffer'; +import { ByteStream, AsyncByteStream } from '../io/stream'; +import { RandomAccessFile, AsyncRandomAccessFile } from '../io/file'; +import { VectorLoader, JSONVectorLoader } from '../visitor/vectorloader'; +import { + FileHandle, + ArrowJSONLike, + ITERATOR_DONE, + ReadableInterop, +} from '../io/interfaces'; +import { + MessageReader, AsyncMessageReader, JSONMessageReader, + checkForMagicArrowString, magicLength, magicAndPadding, magicX2AndPadding +} from './message'; +import { + isPromise, + isIterable, isAsyncIterable, + isIteratorResult, isArrowJSON, + isFileHandle, isFetchResponse, + isReadableDOMStream, isReadableNodeStream +} from '../util/compat'; + +/** @ignore */ export type FromArg0 = ArrowJSONLike; +/** @ignore */ export type FromArg1 = PromiseLike; +/** @ignore */ export type FromArg2 = Iterable | ArrayBufferViewInput; +/** @ignore */ export type FromArg3 = PromiseLike | ArrayBufferViewInput>; +/** @ignore */ export type FromArg4 = Response | NodeJS.ReadableStream | ReadableStream | AsyncIterable; +/** @ignore */ export type FromArg5 = FileHandle | PromiseLike | PromiseLike; +/** @ignore */ export type FromArgs = FromArg0 | FromArg1 | FromArg2 | FromArg3 | FromArg4 | FromArg5; + +/** @ignore */ type OpenOptions = { autoDestroy?: boolean; }; +/** @ignore */ type RecordBatchReaders = RecordBatchFileReader | RecordBatchStreamReader; +/** @ignore */ type AsyncRecordBatchReaders = AsyncRecordBatchFileReader | AsyncRecordBatchStreamReader; +/** @ignore */ type RecordBatchFileReaders = RecordBatchFileReader | AsyncRecordBatchFileReader; +/** @ignore */ type RecordBatchStreamReaders = RecordBatchStreamReader | AsyncRecordBatchStreamReader; + +export class RecordBatchReader extends ReadableInterop> { + + protected _impl: RecordBatchReaderImpls; + protected constructor(impl: RecordBatchReaderImpls) { + super(); + this._impl = impl; + } + + public get closed() { return this._impl.closed; } + public get schema() { return this._impl.schema; } + public get autoDestroy() { return this._impl.autoDestroy; } + public get dictionaries() { return this._impl.dictionaries; } + public get numDictionaries() { return this._impl.numDictionaries; } + public get numRecordBatches() { return this._impl.numRecordBatches; } + public get footer() { return this._impl.isFile() ? 
this._impl.footer : null; } + + public isSync(): this is RecordBatchReaders { return this._impl.isSync(); } + public isAsync(): this is AsyncRecordBatchReaders { return this._impl.isAsync(); } + public isFile(): this is RecordBatchFileReaders { return this._impl.isFile(); } + public isStream(): this is RecordBatchStreamReaders { return this._impl.isStream(); } + + public next() { + return this._impl.next(); + } + public throw(value?: any) { + return this._impl.throw(value); + } + public return(value?: any) { + return this._impl.return(value); + } + public cancel() { + return this._impl.cancel(); + } + public reset(schema?: Schema | null): this { + this._impl.reset(schema); + return this; + } + public open(options?: OpenOptions) { + const opening = this._impl.open(options); + return isPromise(opening) ? opening.then(() => this) : this; + } + public readRecordBatch(index: number): RecordBatch | null | Promise | null> { + return this._impl.isFile() ? this._impl.readRecordBatch(index) : null; + } + public [Symbol.iterator](): IterableIterator> { + return (>> this._impl)[Symbol.iterator](); + } + public [Symbol.asyncIterator](): AsyncIterableIterator> { + return (>> this._impl)[Symbol.asyncIterator](); + } + public toDOMStream() { + return streamAdapters.toDOMStream>( + (this.isSync() + ? { [Symbol.iterator]: () => this } as Iterable> + : { [Symbol.asyncIterator]: () => this } as AsyncIterable>)); + } + public toNodeStream() { + return streamAdapters.toNodeStream>( + (this.isSync() + ? { [Symbol.iterator]: () => this } as Iterable> + : { [Symbol.asyncIterator]: () => this } as AsyncIterable>), + { objectMode: true }); + } + + /** @nocollapse */ + // @ts-ignore + public static throughNode(options?: import('stream').DuplexOptions & { autoDestroy: boolean }): import('stream').Duplex { + throw new Error(`"throughNode" not available in this environment`); + } + /** @nocollapse */ + public static throughDOM( + // @ts-ignore + writableStrategy?: ByteLengthQueuingStrategy, + // @ts-ignore + readableStrategy?: { autoDestroy: boolean } + ): { writable: WritableStream, readable: ReadableStream> } { + throw new Error(`"throughDOM" not available in this environment`); + } + + public static from(source: T): T; + public static from(source: FromArg0): RecordBatchStreamReader; + public static from(source: FromArg1): Promise>; + public static from(source: FromArg2): RecordBatchFileReader | RecordBatchStreamReader; + public static from(source: FromArg3): Promise | RecordBatchStreamReader>; + public static from(source: FromArg4): Promise | AsyncRecordBatchReaders>; + public static from(source: FromArg5): Promise | AsyncRecordBatchStreamReader>; + /** @nocollapse */ + public static from(source: any) { + if (source instanceof RecordBatchReader) { + return source; + } else if (isArrowJSON(source)) { + return fromArrowJSON(source); + } else if (isFileHandle(source)) { + return fromFileHandle(source); + } else if (isPromise(source)) { + return (async () => await RecordBatchReader.from(await source))(); + } else if (isFetchResponse(source) || isReadableDOMStream(source) || isReadableNodeStream(source) || isAsyncIterable(source)) { + return fromAsyncByteStream(new AsyncByteStream(source)); + } + return fromByteStream(new ByteStream(source)); + } + + public static readAll(source: T): T extends RecordBatchReaders ? 
IterableIterator : AsyncIterableIterator; + public static readAll(source: FromArg0): IterableIterator>; + public static readAll(source: FromArg1): AsyncIterableIterator>; + public static readAll(source: FromArg2): IterableIterator | RecordBatchStreamReader>; + public static readAll(source: FromArg3): AsyncIterableIterator | RecordBatchStreamReader>; + public static readAll(source: FromArg4): AsyncIterableIterator | AsyncRecordBatchReaders>; + public static readAll(source: FromArg5): AsyncIterableIterator | AsyncRecordBatchStreamReader>; + /** @nocollapse */ + public static readAll(source: any) { + if (source instanceof RecordBatchReader) { + return source.isSync() ? readAllSync(source) : readAllAsync(source as AsyncRecordBatchReaders); + } else if (isArrowJSON(source) || ArrayBuffer.isView(source) || isIterable(source) || isIteratorResult(source)) { + return readAllSync(source) as IterableIterator>; + } + return readAllAsync(source) as AsyncIterableIterator | AsyncRecordBatchReaders>; + } +} + +// +// Since TS is a structural type system, we define the following subclass stubs +// so that concrete types exist to associate with the interfaces below. +// +// The implementation for each RecordBatchReader is hidden away in the set of +// `RecordBatchReaderImpl` classes in the second half of this file. This allows +// us to export a single RecordBatchReader class, and swap out the impl based +// on the io primitives or underlying arrow (JSON, file, or stream) at runtime. +// +// Async/await makes our job a bit harder, since it forces everything to be +// either fully sync or fully async. This is why the logic for the reader impls +// has been duplicated into both sync and async variants. Since the RBR +// delegates to its impl, an RBR with an AsyncRecordBatchFileReaderImpl for +// example will return async/await-friendly Promises, but one with a (sync) +// RecordBatchStreamReaderImpl will always return values. Nothing should be +// different about their logic, aside from the async handling. This is also why +// this code looks highly structured, as it should be nearly identical and easy +// to follow.
+// + +/** @ignore */ +export class RecordBatchStreamReader extends RecordBatchReader { + constructor(protected _impl: RecordBatchStreamReaderImpl) { super (_impl); } + public [Symbol.iterator]() { return (this._impl as IterableIterator>)[Symbol.iterator](); } + public async *[Symbol.asyncIterator](): AsyncIterableIterator> { yield* this[Symbol.iterator](); } +} +/** @ignore */ +export class AsyncRecordBatchStreamReader extends RecordBatchReader { + constructor(protected _impl: AsyncRecordBatchStreamReaderImpl) { super (_impl); } + public [Symbol.iterator](): IterableIterator> { throw new Error(`AsyncRecordBatchStreamReader is not Iterable`); } + public [Symbol.asyncIterator]() { return (this._impl as AsyncIterableIterator>)[Symbol.asyncIterator](); } +} +/** @ignore */ +export class RecordBatchFileReader extends RecordBatchStreamReader { + constructor(protected _impl: RecordBatchFileReaderImpl) { super (_impl); } +} +/** @ignore */ +export class AsyncRecordBatchFileReader extends AsyncRecordBatchStreamReader { + constructor(protected _impl: AsyncRecordBatchFileReaderImpl) { super (_impl); } +} + +// +// Now override the return types for each sync/async RecordBatchReader variant +// + +/** @ignore */ +export interface RecordBatchStreamReader extends RecordBatchReader { + open(options?: OpenOptions | undefined): this; + cancel(): void; + throw(value?: any): IteratorResult; + return(value?: any): IteratorResult; + next(value?: any): IteratorResult>; +} + +/** @ignore */ +export interface AsyncRecordBatchStreamReader extends RecordBatchReader { + open(options?: OpenOptions | undefined): Promise; + cancel(): Promise; + throw(value?: any): Promise>; + return(value?: any): Promise>; + next(value?: any): Promise>>; +} + +/** @ignore */ +export interface RecordBatchFileReader extends RecordBatchStreamReader { + footer: Footer; + readRecordBatch(index: number): RecordBatch | null; +} + +/** @ignore */ +export interface AsyncRecordBatchFileReader extends AsyncRecordBatchStreamReader { + footer: Footer; + readRecordBatch(index: number): Promise | null>; +} + +/** @ignore */ +type RecordBatchReaderImpls = + RecordBatchJSONReaderImpl | + RecordBatchFileReaderImpl | + RecordBatchStreamReaderImpl | + AsyncRecordBatchFileReaderImpl | + AsyncRecordBatchStreamReaderImpl; + +/** @ignore */ +interface RecordBatchReaderImpl { + + closed: boolean; + schema: Schema; + autoDestroy: boolean; + dictionaries: Map; + + isFile(): this is RecordBatchFileReaders; + isStream(): this is RecordBatchStreamReaders; + isSync(): this is RecordBatchReaders; + isAsync(): this is AsyncRecordBatchReaders; + + reset(schema?: Schema | null): this; +} + +/** @ignore */ +interface RecordBatchStreamReaderImpl extends RecordBatchReaderImpl { + + open(options?: OpenOptions): this; + cancel(): void; + + throw(value?: any): IteratorResult; + return(value?: any): IteratorResult; + next(value?: any): IteratorResult>; + + [Symbol.iterator](): IterableIterator>; +} + +/** @ignore */ +interface AsyncRecordBatchStreamReaderImpl extends RecordBatchReaderImpl { + + open(options?: OpenOptions): Promise; + cancel(): Promise; + + throw(value?: any): Promise>; + return(value?: any): Promise>; + next(value?: any): Promise>>; + + [Symbol.asyncIterator](): AsyncIterableIterator>; +} + +/** @ignore */ +interface RecordBatchFileReaderImpl extends RecordBatchStreamReaderImpl { + readRecordBatch(index: number): RecordBatch | null; +} + +/** @ignore */ +interface AsyncRecordBatchFileReaderImpl extends AsyncRecordBatchStreamReaderImpl { + 
readRecordBatch(index: number): Promise | null>; +} + +/** @ignore */ +abstract class RecordBatchReaderImpl implements RecordBatchReaderImpl { + + // @ts-ignore + public schema: Schema; + public closed = false; + public autoDestroy = true; + public dictionaries: Map; + + protected _dictionaryIndex = 0; + protected _recordBatchIndex = 0; + public get numDictionaries() { return this._dictionaryIndex; } + public get numRecordBatches() { return this._recordBatchIndex; } + + constructor(dictionaries = new Map()) { + this.dictionaries = dictionaries; + } + + public isSync(): this is RecordBatchReaders { return false; } + public isAsync(): this is AsyncRecordBatchReaders { return false; } + public isFile(): this is RecordBatchFileReaders { return false; } + public isStream(): this is RecordBatchStreamReaders { return false; } + + public reset(schema?: Schema | null) { + this._dictionaryIndex = 0; + this._recordBatchIndex = 0; + this.schema = schema; + this.dictionaries = new Map(); + return this; + } + + protected _loadRecordBatch(header: metadata.RecordBatch, body: any) { + return new RecordBatch(this.schema, header.length, this._loadVectors(header, body, this.schema.fields)); + } + protected _loadDictionaryBatch(header: metadata.DictionaryBatch, body: any) { + const { id, isDelta, data } = header; + const { dictionaries, schema } = this; + if (isDelta || !dictionaries.get(id)) { + + const type = schema.dictionaries.get(id)!; + const vector = (isDelta ? dictionaries.get(id)!.concat( + Vector.new(this._loadVectors(data, body, [type])[0])) : + Vector.new(this._loadVectors(data, body, [type])[0])) as Vector; + + (schema.dictionaryFields.get(id) || []).forEach(({ type }) => type.dictionaryVector = vector); + + return vector; + } + return dictionaries.get(id)!; + } + protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { + return new VectorLoader(body, header.nodes, header.buffers).visitMany(types); + } +} + +/** @ignore */ +class RecordBatchStreamReaderImpl extends RecordBatchReaderImpl implements IterableIterator> { + + protected _reader: MessageReader; + protected _handle: ByteStream | ArrowJSONLike; + + constructor(source: ByteStream | ArrowJSONLike, dictionaries?: Map) { + super(dictionaries); + this._reader = !isArrowJSON(source) + ? 
new MessageReader(this._handle = source) + : new JSONMessageReader(this._handle = source); + } + + public isSync(): this is RecordBatchReaders { return true; } + public isStream(): this is RecordBatchStreamReaders { return true; } + public [Symbol.iterator](): IterableIterator> { + return this as IterableIterator>; + } + public cancel() { + if (!this.closed && (this.closed = true)) { + this.reset()._reader.return(); + this._reader = null; + this.dictionaries = null; + } + } + public open(options?: OpenOptions) { + if (!this.closed) { + this.autoDestroy = shouldAutoDestroy(this, options); + if (!(this.schema || (this.schema = this._reader.readSchema()!))) { + this.cancel(); + } + } + return this; + } + public throw(value?: any): IteratorResult { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return this.reset()._reader.throw(value); + } + return ITERATOR_DONE; + } + public return(value?: any): IteratorResult { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return this.reset()._reader.return(value); + } + return ITERATOR_DONE; + } + public next(): IteratorResult> { + if (this.closed) { return ITERATOR_DONE; } + let message: Message | null, { _reader: reader } = this; + while (message = this._readNextMessageAndValidate()) { + if (message.isSchema()) { + this.reset(message.header()); + } else if (message.isRecordBatch()) { + this._recordBatchIndex++; + const header = message.header(); + const buffer = reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return { done: false, value: recordBatch }; + } else if (message.isDictionaryBatch()) { + this._dictionaryIndex++; + const header = message.header(); + const buffer = reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + return this.return(); + } + protected _readNextMessageAndValidate(type?: T | null) { + return this._reader.readMessage(type); + } +} + +/** @ignore */ +class AsyncRecordBatchStreamReaderImpl extends RecordBatchReaderImpl implements AsyncIterableIterator> { + + protected _handle: AsyncByteStream; + protected _reader: AsyncMessageReader; + + constructor(source: AsyncByteStream, dictionaries?: Map) { + super(dictionaries); + this._reader = new AsyncMessageReader(this._handle = source); + } + public isAsync(): this is AsyncRecordBatchReaders { return true; } + public isStream(): this is RecordBatchStreamReaders { return true; } + public [Symbol.asyncIterator](): AsyncIterableIterator> { + return this as AsyncIterableIterator>; + } + public async cancel() { + if (!this.closed && (this.closed = true)) { + await this.reset()._reader.return(); + this._reader = null; + this.dictionaries = null; + } + } + public async open(options?: OpenOptions) { + if (!this.closed) { + this.autoDestroy = shouldAutoDestroy(this, options); + if (!(this.schema || (this.schema = (await this._reader.readSchema())!))) { + await this.cancel(); + } + } + return this; + } + public async throw(value?: any): Promise> { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return await this.reset()._reader.throw(value); + } + return ITERATOR_DONE; + } + public async return(value?: any): Promise> { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return await this.reset()._reader.return(value); + } + return ITERATOR_DONE; + } + public async next() { + if (this.closed) { return ITERATOR_DONE; } + let message: Message | null, { _reader: 
reader } = this; + while (message = await this._readNextMessageAndValidate()) { + if (message.isSchema()) { + await this.reset(message.header()); + } else if (message.isRecordBatch()) { + this._recordBatchIndex++; + const header = message.header(); + const buffer = await reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return { done: false, value: recordBatch }; + } else if (message.isDictionaryBatch()) { + this._dictionaryIndex++; + const header = message.header(); + const buffer = await reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + return await this.return(); + } + protected async _readNextMessageAndValidate(type?: T | null) { + return await this._reader.readMessage(type); + } +} + +/** @ignore */ +class RecordBatchFileReaderImpl extends RecordBatchStreamReaderImpl { + + // @ts-ignore + protected _footer?: Footer; + // @ts-ignore + protected _handle: RandomAccessFile; + public get footer() { return this._footer!; } + public get numDictionaries() { return this._footer ? this._footer.numDictionaries : 0; } + public get numRecordBatches() { return this._footer ? this._footer.numRecordBatches : 0; } + + constructor(source: RandomAccessFile | ArrayBufferViewInput, dictionaries?: Map) { + super(source instanceof RandomAccessFile ? source : new RandomAccessFile(source), dictionaries); + } + public isSync(): this is RecordBatchReaders { return true; } + public isFile(): this is RecordBatchFileReaders { return true; } + public open(options?: OpenOptions) { + if (!this.closed && !this._footer) { + this.schema = (this._footer = this._readFooter()).schema; + for (const block of this._footer.dictionaryBatches()) { + block && this._readDictionaryBatch(this._dictionaryIndex++); + } + } + return super.open(options); + } + public readRecordBatch(index: number) { + if (this.closed) { return null; } + if (!this._footer) { this.open(); } + const block = this._footer && this._footer.getRecordBatch(index); + if (block && this._handle.seek(block.offset)) { + const message = this._reader.readMessage(MessageHeader.RecordBatch); + if (message && message.isRecordBatch()) { + const header = message.header(); + const buffer = this._reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return recordBatch; + } + } + return null; + } + protected _readDictionaryBatch(index: number) { + const block = this._footer && this._footer.getDictionaryBatch(index); + if (block && this._handle.seek(block.offset)) { + const message = this._reader.readMessage(MessageHeader.DictionaryBatch); + if (message && message.isDictionaryBatch()) { + const header = message.header(); + const buffer = this._reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + } + protected _readFooter() { + const { _handle } = this; + const offset = _handle.size - magicAndPadding; + const length = _handle.readInt32(offset); + const buffer = _handle.readAt(offset - length, length); + return Footer.decode(buffer); + } + protected _readNextMessageAndValidate(type?: T | null): Message | null { + if (!this._footer) { this.open(); } + if (this._footer && this._recordBatchIndex < this.numRecordBatches) { + const block = this._footer && this._footer.getRecordBatch(this._recordBatchIndex); + if (block && this._handle.seek(block.offset)) { + return 
this._reader.readMessage(type); + } + } + return null; + } +} + +/** @ignore */ +class AsyncRecordBatchFileReaderImpl extends AsyncRecordBatchStreamReaderImpl + implements AsyncRecordBatchFileReaderImpl { + + protected _footer?: Footer; + // @ts-ignore + protected _handle: AsyncRandomAccessFile; + public get footer() { return this._footer!; } + public get numDictionaries() { return this._footer ? this._footer.numDictionaries : 0; } + public get numRecordBatches() { return this._footer ? this._footer.numRecordBatches : 0; } + + constructor(source: FileHandle, byteLength?: number, dictionaries?: Map); + constructor(source: FileHandle | AsyncRandomAccessFile, dictionaries?: Map); + constructor(source: FileHandle | AsyncRandomAccessFile, ...rest: any[]) { + const byteLength = typeof rest[0] !== 'number' ? rest.shift() : undefined; + const dictionaries = rest[0] instanceof Map ? > rest.shift() : undefined; + super(source instanceof AsyncRandomAccessFile ? source : new AsyncRandomAccessFile(source, byteLength), dictionaries); + } + public isFile(): this is RecordBatchFileReaders { return true; } + public isAsync(): this is AsyncRecordBatchReaders { return true; } + public async open(options?: OpenOptions) { + if (!this.closed && !this._footer) { + this.schema = (this._footer = await this._readFooter()).schema; + for (const block of this._footer.dictionaryBatches()) { + block && await this._readDictionaryBatch(this._dictionaryIndex++); + } + } + return await super.open(options); + } + public async readRecordBatch(index: number) { + if (this.closed) { return null; } + if (!this._footer) { await this.open(); } + const block = this._footer && this._footer.getRecordBatch(index); + if (block && (await this._handle.seek(block.offset))) { + const message = await this._reader.readMessage(MessageHeader.RecordBatch); + if (message && message.isRecordBatch()) { + const header = message.header(); + const buffer = await this._reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return recordBatch; + } + } + return null; + } + protected async _readDictionaryBatch(index: number) { + const block = this._footer && this._footer.getDictionaryBatch(index); + if (block && (await this._handle.seek(block.offset))) { + const message = await this._reader.readMessage(MessageHeader.DictionaryBatch); + if (message && message.isDictionaryBatch()) { + const header = message.header(); + const buffer = await this._reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + } + protected async _readFooter() { + const { _handle } = this; + _handle._pending && await _handle._pending; + const offset = _handle.size - magicAndPadding; + const length = await _handle.readInt32(offset); + const buffer = await _handle.readAt(offset - length, length); + return Footer.decode(buffer); + } + protected async _readNextMessageAndValidate(type?: T | null): Promise | null> { + if (!this._footer) { await this.open(); } + if (this._footer && this._recordBatchIndex < this.numRecordBatches) { + const block = this._footer.getRecordBatch(this._recordBatchIndex); + if (block && await this._handle.seek(block.offset)) { + return await this._reader.readMessage(type); + } + } + return null; + } +} + +/** @ignore */ +class RecordBatchJSONReaderImpl extends RecordBatchStreamReaderImpl { + constructor(source: ArrowJSONLike, dictionaries?: Map) { + super(source, dictionaries); + } + protected 
_loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { + return new JSONVectorLoader(body, header.nodes, header.buffers).visitMany(types); + } +} + +// +// Define some helper functions and static implementations down here. There's +// a bit of branching in the static methods that can lead to the same routines +// being executed, so we've broken those out here for readability. +// + +/** @ignore */ +function shouldAutoDestroy(self: { autoDestroy: boolean }, options?: OpenOptions) { + return options && (typeof options['autoDestroy'] === 'boolean') ? options['autoDestroy'] : self['autoDestroy']; +} + +/** @ignore */ +function* readAllSync(source: RecordBatchReaders | FromArg0 | FromArg2) { + const reader = RecordBatchReader.from( source) as RecordBatchReaders; + try { + if (!reader.open({ autoDestroy: false }).closed) { + do { yield reader; } while (!(reader.reset().open()).closed); + } + } finally { reader.cancel(); } +} + +/** @ignore */ +async function* readAllAsync(source: AsyncRecordBatchReaders | FromArg1 | FromArg3 | FromArg4 | FromArg5) { + const reader = await RecordBatchReader.from( source) as RecordBatchReader; + try { + if (!(await reader.open({ autoDestroy: false })).closed) { + do { yield reader; } while (!(await reader.reset().open()).closed); + } + } finally { await reader.cancel(); } +} + +/** @ignore */ +function fromArrowJSON(source: ArrowJSONLike) { + return new RecordBatchStreamReader(new RecordBatchJSONReaderImpl(source)); +} + +/** @ignore */ +function fromByteStream(source: ByteStream) { + const bytes = source.peek((magicLength + 7) & ~7); + return bytes && bytes.byteLength >= 4 ? !checkForMagicArrowString(bytes) + ? new RecordBatchStreamReader(new RecordBatchStreamReaderImpl(source)) + : new RecordBatchFileReader(new RecordBatchFileReaderImpl(source.read())) + : new RecordBatchStreamReader(new RecordBatchStreamReaderImpl(function*(): any {}())); +} + +/** @ignore */ +async function fromAsyncByteStream(source: AsyncByteStream) { + const bytes = await source.peek((magicLength + 7) & ~7); + return bytes && bytes.byteLength >= 4 ? !checkForMagicArrowString(bytes) + ? new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl(source)) + : new RecordBatchFileReader(new RecordBatchFileReaderImpl(await source.read())) + : new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl(async function*(): any {}())); +} + +/** @ignore */ +async function fromFileHandle(source: FileHandle) { + const { size } = await source.stat(); + const file = new AsyncRandomAccessFile(source, size); + if (size >= magicX2AndPadding) { + if (checkForMagicArrowString(await file.readAt(0, (magicLength + 7) & ~7))) { + return new AsyncRecordBatchFileReader(new AsyncRecordBatchFileReaderImpl(file)); + } + } + return new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl(file)); +} diff --git a/js/src/ipc/reader/arrow.ts b/js/src/ipc/reader/arrow.ts deleted file mode 100644 index 1847c9c2eb628..0000000000000 --- a/js/src/ipc/reader/arrow.ts +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { readJSON } from './json'; -import { fromReadableStream } from './node'; -import { RecordBatch } from '../../recordbatch'; -import { readBuffers, readBuffersAsync } from './binary'; -import { readRecordBatches, readRecordBatchesAsync, TypeDataLoader } from './vector'; -import { Schema } from '../../type'; -import { Message } from '../metadata'; - -export { readJSON, RecordBatch }; -export { readBuffers, readBuffersAsync }; -export { readRecordBatches, readRecordBatchesAsync }; - -export function* read(sources: Iterable | object | string) { - let input: any = sources; - let messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>; - if (typeof input === 'string') { - try { input = JSON.parse(input); } - catch (e) { input = sources; } - } - if (!input || typeof input !== 'object') { - messages = (typeof input === 'string') ? readBuffers([input]) : []; - } else { - messages = (typeof input[Symbol.iterator] === 'function') ? readBuffers(input) : readJSON(input); - } - yield* readRecordBatches(messages); -} - -export async function* readAsync(sources: AsyncIterable) { - for await (let recordBatch of readRecordBatchesAsync(readBuffersAsync(sources))) { - yield recordBatch; - } -} - -export async function* readStream(stream: NodeJS.ReadableStream) { - for await (const recordBatch of readAsync(fromReadableStream(stream))) { - yield recordBatch as RecordBatch; - } -} diff --git a/js/src/ipc/reader/binary.ts b/js/src/ipc/reader/binary.ts deleted file mode 100644 index 988ce606b2614..0000000000000 --- a/js/src/ipc/reader/binary.ts +++ /dev/null @@ -1,432 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
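For context, the `read()` and `readAsync()` entry points removed in `js/src/ipc/reader/arrow.ts` above are superseded by the `RecordBatchReader.from()` factory added in `js/src/ipc/reader.ts`. A minimal usage sketch — not part of this patch; the file name and the `apache-arrow` package entry point are assumptions:

```ts
import { readFileSync } from 'fs';
import { RecordBatchReader } from 'apache-arrow';

// from() sniffs the leading bytes for the "ARROW1" magic and returns
// a file reader or a stream reader accordingly.
const reader = RecordBatchReader.from(readFileSync('table.arrow'));
for (const batch of reader) {
    console.log(batch.length); // row count of each RecordBatch
}
```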
- -import { Vector } from '../../vector'; -import { flatbuffers } from 'flatbuffers'; -import { TypeDataLoader } from './vector'; -import { checkForMagicArrowString, PADDING, magicAndPadding, isValidArrowFile } from '../magic'; -import { Message, Footer, FileBlock, RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata, } from '../metadata'; -import { - Schema, Field, - DataType, Dictionary, - Null, TimeBitWidth, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from '../../type'; - -import { - Int8, Uint8, - Int16, Uint16, - Int32, Uint32, - Int64, Uint64, - Float16, Float64, Float32, -} from '../../type'; - -import ByteBuffer = flatbuffers.ByteBuffer; - -type MessageReader = (bb: ByteBuffer) => IterableIterator; - -export function* readBuffers(sources: Iterable | Uint8Array | Buffer | string) { - let schema: Schema | null = null; - let dictionaries = new Map(); - let readMessages: MessageReader | null = null; - if (ArrayBuffer.isView(sources) || typeof sources === 'string') { - sources = [sources as T]; - } - for (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb)) || true) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, - loader: new BinaryDataLoader( - bb, - arrayIterator((message as any).nodes || []), - arrayIterator((message as any).buffers || []), - dictionaries - ) - }; - } - } - } -} - -export async function* readBuffersAsync(sources: AsyncIterable) { - let schema: Schema | null = null; - let dictionaries = new Map(); - let readMessages: MessageReader | null = null; - for await (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb)) || true) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, - loader: new BinaryDataLoader( - bb, - arrayIterator((message as any).nodes || []), - arrayIterator((message as any).buffers || []), - dictionaries - ) - }; - } - } - } -} - -export class BinaryDataLoader extends TypeDataLoader { - private bytes: Uint8Array; - private messageOffset: number; - constructor(bb: ByteBuffer, nodes: Iterator, buffers: Iterator, dictionaries: Map) { - super(nodes, buffers, dictionaries); - this.bytes = bb.bytes(); - this.messageOffset = bb.position(); - } - protected readOffsets(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); } - protected readTypeIds(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); } - protected readData(_type: T, { length, offset }: BufferMetadata = this.getBufferMetadata()) { - return new Uint8Array(this.bytes.buffer, this.bytes.byteOffset + this.messageOffset + offset, length); - } -} - -function* arrayIterator(arr: Array) { yield* arr; } - -function toByteBuffer(bytes?: Uint8Array | Buffer | string) { - let arr: Uint8Array = bytes as any || new Uint8Array(0); - if (typeof bytes === 'string') { - arr = new Uint8Array(bytes.length); - for (let i = -1, n = bytes.length; ++i < n;) { - arr[i] = bytes.charCodeAt(i); - } - return new ByteBuffer(arr); - } - return new ByteBuffer(arr); -} - -function readSchema(bb: ByteBuffer) { - let schema: Schema, readMessages, footer: Footer | null; - if (footer = readFileSchema(bb)) { - schema = footer.schema; - readMessages = readFileMessages(footer); - } else if (schema = readStreamSchema(bb)!) 
{ - readMessages = readStreamMessages; - } else { - throw new Error('Invalid Arrow buffer'); - } - return { schema, readMessages }; -} - -function readStreamSchema(bb: ByteBuffer) { - if (!checkForMagicArrowString(bb.bytes(), 0)) { - for (const message of readMessages(bb)) { - if (Message.isSchema(message)) { - return message as Schema; - } - } - } - return null; -} - -function* readStreamMessages(bb: ByteBuffer) { - for (const message of readMessages(bb)) { - if (Message.isRecordBatch(message)) { - yield message; - } else if (Message.isDictionaryBatch(message)) { - yield message; - } else { - yield message; - } - // position the buffer after the body to read the next message - bb.setPosition(bb.position() + message.bodyLength); - } -} - -function readFileSchema(bb: ByteBuffer) { - if (!isValidArrowFile(bb)) { - return null; - } - let fileLength = bb.capacity(); - let lengthOffset = fileLength - magicAndPadding; - let footerLength = bb.readInt32(lengthOffset); - bb.setPosition(lengthOffset - footerLength); - return footerFromByteBuffer(bb); -} - -function readFileMessages(footer: Footer) { - return function* (bb: ByteBuffer) { - let message: RecordBatchMetadata | DictionaryBatch; - for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset); - if (message = readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch) { - yield message; - } - } - for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset); - if (message = readMessage(bb, bb.readInt32(bb.position())) as RecordBatchMetadata) { - yield message; - } - } - }; -} - -function* readMessages(bb: ByteBuffer) { - let length: number, message: Schema | RecordBatchMetadata | DictionaryBatch; - while (bb.position() < bb.capacity() && - (length = bb.readInt32(bb.position())) > 0) { - if (message = readMessage(bb, length)!) 
{ - yield message; - } - } -} - -function readMessage(bb: ByteBuffer, length: number) { - bb.setPosition(bb.position() + PADDING); - const message = messageFromByteBuffer(bb); - bb.setPosition(bb.position() + length); - return message; -} - -import * as File_ from '../../fb/File'; -import * as Schema_ from '../../fb/Schema'; -import * as Message_ from '../../fb/Message'; - -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import _Footer = File_.org.apache.arrow.flatbuf.Footer; -import _Block = File_.org.apache.arrow.flatbuf.Block; -import _Message = Message_.org.apache.arrow.flatbuf.Message; -import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; -import _Field = Schema_.org.apache.arrow.flatbuf.Field; -import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; -import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; -import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; -import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; -import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; -import _Null = Schema_.org.apache.arrow.flatbuf.Null; -import _Int = Schema_.org.apache.arrow.flatbuf.Int; -import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; -import _Binary = Schema_.org.apache.arrow.flatbuf.Binary; -import _Bool = Schema_.org.apache.arrow.flatbuf.Bool; -import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8; -import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal; -import _Date = Schema_.org.apache.arrow.flatbuf.Date; -import _Time = Schema_.org.apache.arrow.flatbuf.Time; -import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp; -import _Interval = Schema_.org.apache.arrow.flatbuf.Interval; -import _List = Schema_.org.apache.arrow.flatbuf.List; -import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_; -import _Union = Schema_.org.apache.arrow.flatbuf.Union; -import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary; -import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; -import _Map = Schema_.org.apache.arrow.flatbuf.Map; - -function footerFromByteBuffer(bb: ByteBuffer) { - const dictionaryFields = new Map>(); - const f = _Footer.getRootAsFooter(bb), s = f.schema()!; - return new Footer( - dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f), - new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), f.version(), dictionaryFields) - ); -} - -function messageFromByteBuffer(bb: ByteBuffer) { - const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version(); - switch (type) { - case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!, new Map()); - case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m, m.header(new _RecordBatch())!); - case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m, m.header(new _DictionaryBatch())!); - } - return null; - // throw new Error(`Unrecognized Message type '${type}'`); -} - -function schemaFromMessage(version: MetadataVersion, s: _Schema, dictionaryFields: Map>) { - return new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), version, dictionaryFields); -} - -function recordBatchFromMessage(version: MetadataVersion, m: _Message, b: _RecordBatch) { - return new 
RecordBatchMetadata(version, b.length(), fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version), m.bodyLength()); -} - -function dictionaryBatchFromMessage(version: MetadataVersion, m: _Message, d: _DictionaryBatch) { - return new DictionaryBatch(version, recordBatchFromMessage(version, m, d.data()!), d.id(), d.isDelta()); -} - -function dictionaryBatchesFromFooter(f: _Footer) { - const blocks = [] as FileBlock[]; - for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) { - if (b = f.dictionaries(i)!) { - blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function recordBatchesFromFooter(f: _Footer) { - const blocks = [] as FileBlock[]; - for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) { - if (b = f.recordBatches(i)!) { - blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function fieldsFromSchema(s: _Schema, dictionaryFields: Map> | null) { - const fields = [] as Field[]; - for (let i = -1, c: Field | null, n = s && s.fieldsLength(); ++i < n;) { - if (c = field(s.fields(i)!, dictionaryFields)) { - fields.push(c); - } - } - return fields; -} - -function fieldsFromField(f: _Field, dictionaryFields: Map> | null) { - const fields = [] as Field[]; - for (let i = -1, c: Field | null, n = f && f.childrenLength(); ++i < n;) { - if (c = field(f.children(i)!, dictionaryFields)) { - fields.push(c); - } - } - return fields; -} - -function fieldNodesFromRecordBatch(b: _RecordBatch) { - const fieldNodes = [] as FieldMetadata[]; - for (let i = -1, n = b.nodesLength(); ++i < n;) { - fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!)); - } - return fieldNodes; -} - -function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) { - const buffers = [] as BufferMetadata[]; - for (let i = -1, n = b.buffersLength(); ++i < n;) { - let buffer = b.buffers(i)!; - // If this Arrow buffer was written before version 4, - // advance the buffer's bb_pos 8 bytes to skip past - // the now-removed page id field. - if (version < MetadataVersion.V4) { - buffer.bb_pos += (8 * (i + 1)); - } - buffers.push(bufferFromRecordBatch(buffer)); - } - return buffers; -} - -function field(f: _Field, dictionaryFields: Map> | null) { - let name = f.name()!; - let field: Field | void; - let nullable = f.nullable(); - let metadata = customMetadata(f); - let dataType: DataType | null; - let keysMeta: _Int | null, id: number; - let dictMeta: _DictionaryEncoding | null; - if (!dictionaryFields || !(dictMeta = f.dictionary())) { - if (dataType = typeFromField(f, fieldsFromField(f, dictionaryFields))) { - field = new Field(name, dataType, nullable, metadata); - } - } else if (dataType = dictionaryFields.has(id = dictMeta.id().low) - ? dictionaryFields.get(id)!.type.dictionary - : typeFromField(f, fieldsFromField(f, null))) { - dataType = new Dictionary(dataType, - // a dictionary index defaults to signed 32 bit int if unspecified - (keysMeta = dictMeta.indexType()) ? intFromField(keysMeta)! 
: new Int32(), - id, dictMeta.isOrdered() - ); - field = new Field(name, dataType, nullable, metadata); - dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); - } - return field || null; -} - -function customMetadata(parent?: _Schema | _Field | null) { - const data = new Map(); - if (parent) { - for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { - if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { - data.set(key, entry.value()!); - } - } - } - return data; -} - -function fieldNodeFromRecordBatch(f: _FieldNode) { - return new FieldMetadata(f.length(), f.nullCount()); -} - -function bufferFromRecordBatch(b: _Buffer) { - return new BufferMetadata(b.offset(), b.length()); -} - -function typeFromField(f: _Field, children?: Field[]): DataType | null { - switch (f.typeType()) { - case Type.NONE: return null; - case Type.Null: return nullFromField(f.type(new _Null())!); - case Type.Int: return intFromField(f.type(new _Int())!); - case Type.FloatingPoint: return floatFromField(f.type(new _FloatingPoint())!); - case Type.Binary: return binaryFromField(f.type(new _Binary())!); - case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); - case Type.Bool: return boolFromField(f.type(new _Bool())!); - case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); - case Type.Date: return dateFromField(f.type(new _Date())!); - case Type.Time: return timeFromField(f.type(new _Time())!); - case Type.Timestamp: return timestampFromField(f.type(new _Timestamp())!); - case Type.Interval: return intervalFromField(f.type(new _Interval())!); - case Type.List: return listFromField(f.type(new _List())!, children || []); - case Type.Struct_: return structFromField(f.type(new _Struct())!, children || []); - case Type.Union: return unionFromField(f.type(new _Union())!, children || []); - case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); - case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!, children || []); - case Type.Map: return mapFromField(f.type(new _Map())!, children || []); - } - throw new Error(`Unrecognized type ${f.typeType()}`); -} - -function nullFromField (_type: _Null) { return new Null(); } -function intFromField (_type: _Int) { switch (_type.bitWidth()) { - case 8: return _type.isSigned() ? new Int8() : new Uint8(); - case 16: return _type.isSigned() ? new Int16() : new Uint16(); - case 32: return _type.isSigned() ? new Int32() : new Uint32(); - case 64: return _type.isSigned() ? 
new Int64() : new Uint64(); - } - return null; } -function floatFromField (_type: _FloatingPoint) { switch (_type.precision()) { - case Precision.HALF: return new Float16(); - case Precision.SINGLE: return new Float32(); - case Precision.DOUBLE: return new Float64(); - } - return null; } -function binaryFromField (_type: _Binary) { return new Binary(); } -function utf8FromField (_type: _Utf8) { return new Utf8(); } -function boolFromField (_type: _Bool) { return new Bool(); } -function decimalFromField (_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } -function dateFromField (_type: _Date) { return new Date_(_type.unit()); } -function timeFromField (_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } -function timestampFromField (_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } -function intervalFromField (_type: _Interval) { return new Interval(_type.unit()); } -function listFromField (_type: _List, children: Field[]) { return new List(children); } -function structFromField (_type: _Struct, children: Field[]) { return new Struct(children); } -function unionFromField (_type: _Union, children: Field[]) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[], children); } -function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } -function fixedSizeListFromField (_type: _FixedSizeList, children: Field[]) { return new FixedSizeList(_type.listSize(), children); } -function mapFromField (_type: _Map, children: Field[]) { return new Map_(_type.keysSorted(), children); } diff --git a/js/src/ipc/reader/json.ts b/js/src/ipc/reader/json.ts deleted file mode 100644 index 0f0c018d66bb9..0000000000000 --- a/js/src/ipc/reader/json.ts +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
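The `intFromField` helper deleted just above dispatches flatbuffers `Int` metadata to a concrete Arrow type. A condensed, self-contained sketch of that mapping, assuming the type classes as exported by `apache-arrow`:

```ts
import { DataType, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64 } from 'apache-arrow';

// (bitWidth, isSigned) -> Arrow integer type, mirroring the removed intFromField().
function intTypeFor(bitWidth: number, isSigned: boolean): DataType | null {
    switch (bitWidth) {
        case 8:  return isSigned ? new Int8()  : new Uint8();
        case 16: return isSigned ? new Int16() : new Uint16();
        case 32: return isSigned ? new Int32() : new Uint32();
        case 64: return isSigned ? new Int64() : new Uint64();
        default: return null; // unrecognized bit width
    }
}
```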
- -import { Vector } from '../../vector'; -import { flatbuffers } from 'flatbuffers'; -import { TypeDataLoader } from './vector'; -import { packBools } from '../../util/bit'; -import * as IntUtil from '../../util/int'; -import { TextEncoder } from 'text-encoding-utf-8'; -import { RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata } from '../metadata'; -import { - Schema, Field, - DataType, Dictionary, - Null, TimeBitWidth, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from '../../type'; - -import { - Int8, Uint8, - Int16, Uint16, - Int32, Uint32, - Int64, Uint64, - Float16, Float64, Float32, -} from '../../type'; - -import Long = flatbuffers.Long; - -export function* readJSON(json: any) { - const schema = schemaFromJSON(json['schema']); - const dictionaries = new Map(); - for (const batch of (json['dictionaries'] || [])) { - const message = dictionaryBatchFromJSON(batch); - yield { - schema, message, - loader: new JSONDataLoader( - flattenDataSources(batch['data']['columns']), - arrayIterator(message.nodes), - arrayIterator(message.buffers), - dictionaries - ) - }; - } - for (const batch of (json['batches'] || [])) { - const message = recordBatchFromJSON(batch); - yield { - schema, message, - loader: new JSONDataLoader( - flattenDataSources(batch['columns']), - arrayIterator(message.nodes), - arrayIterator(message.buffers), - dictionaries - ) - }; - } -} - -function* arrayIterator(arr: Array) { yield* arr; } -function flattenDataSources(xs: any[]): any[][] { - return (xs || []).reduce((buffers, column: any) => [ - ...buffers, - ...(column['VALIDITY'] && [column['VALIDITY']] || []), - ...(column['OFFSET'] && [column['OFFSET']] || []), - ...(column['TYPE'] && [column['TYPE']] || []), - ...(column['DATA'] && [column['DATA']] || []), - ...flattenDataSources(column['children']) - ], [] as any[][]); -} - -const utf8Encoder = new TextEncoder('utf-8'); - -export class JSONDataLoader extends TypeDataLoader { - constructor(private sources: any[][], nodes: Iterator, buffers: Iterator, dictionaries: Map) { - super(nodes, buffers, dictionaries); - } - protected readNullBitmap(_type: T, nullCount: number, { offset } = this.getBufferMetadata()) { - return nullCount <= 0 ? 
new Uint8Array(0) : packBools(this.sources[offset]); - } - protected readOffsets(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { - return new Int32Array(this.sources[offset]); - } - protected readTypeIds(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { - return new Int8Array(this.sources[offset]); - } - protected readData(type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { - const { sources } = this; - if (DataType.isTimestamp(type) === true) { - return new Uint8Array(IntUtil.Int64.convertArray(sources[offset] as string[]).buffer); - } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) { - return new Uint8Array(IntUtil.Int64.convertArray(sources[offset] as string[]).buffer); - } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) { - return new Uint8Array(IntUtil.Int64.convertArray(sources[offset] as string[]).buffer); - } else if (DataType.isDecimal(type) === true) { - return new Uint8Array(IntUtil.Int128.convertArray(sources[offset] as string[]).buffer); - } else if (DataType.isBinary(type) === true || DataType.isFixedSizeBinary(type) === true) { - return new Uint8Array(binaryDataFromJSON(sources[offset] as string[])); - } else if (DataType.isBool(type) === true) { - return new Uint8Array(packBools(sources[offset] as number[]).buffer); - } else if (DataType.isUtf8(type) === true) { - return utf8Encoder.encode((sources[offset] as string[]).join('')); - } else { - return toTypedArray(type.ArrayType, sources[offset].map((x) => +x)) as any; - } - } -} - -function binaryDataFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. - const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = parseInt(joined.substr(i, 2), 16); - } - return data.buffer; -} - -import * as Schema_ from '../../fb/Schema'; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import { toTypedArray } from '../../data'; - -function schemaFromJSON(s: any): Schema { - const dictionaryFields = new Map>(); - return new Schema( - fieldsFromJSON(s['fields'], dictionaryFields), - customMetadata(s['customMetadata']), - MetadataVersion.V4, dictionaryFields - ); -} - -function recordBatchFromJSON(b: any): RecordBatchMetadata { - return new RecordBatchMetadata( - MetadataVersion.V4, - b['count'], - fieldNodesFromJSON(b['columns']), - buffersFromJSON(b['columns']) - ); -} - -function dictionaryBatchFromJSON(b: any): DictionaryBatch { - return new DictionaryBatch( - MetadataVersion.V4, - recordBatchFromJSON(b['data']), - b['id'], b['isDelta'] - ); -} - -function fieldsFromJSON(fs: any[], dictionaryFields: Map> | null): Field[] { - return (fs || []) - .map((f) => fieldFromJSON(f, dictionaryFields)) - .filter((f) => f != null) as Field[]; -} - -function fieldNodesFromJSON(xs: any[]): FieldMetadata[] { - return (xs || []).reduce((fieldNodes, column: any) => [ - ...fieldNodes, - new FieldMetadata( - new Long(column['count'], 0), - new 
Long(nullCountFromJSON(column['VALIDITY']), 0) - ), - ...fieldNodesFromJSON(column['children']) - ], [] as FieldMetadata[]); -} - -function buffersFromJSON(xs: any[], buffers: BufferMetadata[] = []): BufferMetadata[] { - for (let i = -1, n = (xs || []).length; ++i < n;) { - const column = xs[i]; - column['VALIDITY'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); - column['OFFSET'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); - column['TYPE'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['TYPE'].length, 0))); - column['DATA'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); - buffers = buffersFromJSON(column['children'], buffers); - } - return buffers; -} - -function nullCountFromJSON(validity: number[]) { - return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); -} - -function fieldFromJSON(f: any, dictionaryFields: Map> | null) { - let name = f['name']; - let field: Field | void; - let nullable = f['nullable']; - let dataType: DataType | null; - let id: number, keysMeta: any, dictMeta: any; - let metadata = customMetadata(f['customMetadata']); - if (!dictionaryFields || !(dictMeta = f['dictionary'])) { - if (dataType = typeFromJSON(f['type'], fieldsFromJSON(f['children'], dictionaryFields))) { - field = new Field(name, dataType, nullable, metadata); - } - } else if (dataType = dictionaryFields.has(id = dictMeta['id']) - ? dictionaryFields.get(id)!.type.dictionary - : typeFromJSON(f['type'], fieldsFromJSON(f['children'], null))) { - dataType = new Dictionary(dataType, - // a dictionary index defaults to signed 32 bit int if unspecified - (keysMeta = dictMeta['indexType']) ? intFromJSON(keysMeta)! 
: new Int32(), - id, dictMeta['isOrdered'] - ); - field = new Field(name, dataType, nullable, metadata); - dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); - } - return field || null; -} - -function customMetadata(metadata?: any) { - return new Map(Object.entries(metadata || {})); -} - -const namesToTypeMap: { [n: string]: Type } = { - 'NONE': Type.NONE, - 'null': Type.Null, - 'int': Type.Int, - 'floatingpoint': Type.FloatingPoint, - 'binary': Type.Binary, - 'bool': Type.Bool, - 'utf8': Type.Utf8, - 'decimal': Type.Decimal, - 'date': Type.Date, - 'time': Type.Time, - 'timestamp': Type.Timestamp, - 'interval': Type.Interval, - 'list': Type.List, - 'struct': Type.Struct_, - 'union': Type.Union, - 'fixedsizebinary': Type.FixedSizeBinary, - 'fixedsizelist': Type.FixedSizeList, - 'map': Type.Map, -}; - -function typeFromJSON(t: any, children?: Field[]) { - switch (namesToTypeMap[t['name']]) { - case Type.NONE: return null; - case Type.Null: return nullFromJSON(t); - case Type.Int: return intFromJSON(t); - case Type.FloatingPoint: return floatingPointFromJSON(t); - case Type.Binary: return binaryFromJSON(t); - case Type.Utf8: return utf8FromJSON(t); - case Type.Bool: return boolFromJSON(t); - case Type.Decimal: return decimalFromJSON(t); - case Type.Date: return dateFromJSON(t); - case Type.Time: return timeFromJSON(t); - case Type.Timestamp: return timestampFromJSON(t); - case Type.Interval: return intervalFromJSON(t); - case Type.List: return listFromJSON(t, children || []); - case Type.Struct_: return structFromJSON(t, children || []); - case Type.Union: return unionFromJSON(t, children || []); - case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); - case Type.FixedSizeList: return fixedSizeListFromJSON(t, children || []); - case Type.Map: return mapFromJSON(t, children || []); - } - throw new Error(`Unrecognized type ${t['name']}`); -} - -function nullFromJSON (_type: any) { return new Null(); } -function intFromJSON (_type: any) { switch (_type['bitWidth']) { - case 8: return _type['isSigned'] ? new Int8() : new Uint8(); - case 16: return _type['isSigned'] ? new Int16() : new Uint16(); - case 32: return _type['isSigned'] ? new Int32() : new Uint32(); - case 64: return _type['isSigned'] ? 
new Int64() : new Uint64(); - } - return null; } -function floatingPointFromJSON (_type: any) { switch (Precision[_type['precision']] as any) { - case Precision.HALF: return new Float16(); - case Precision.SINGLE: return new Float32(); - case Precision.DOUBLE: return new Float64(); - } - return null; } -function binaryFromJSON (_type: any) { return new Binary(); } -function utf8FromJSON (_type: any) { return new Utf8(); } -function boolFromJSON (_type: any) { return new Bool(); } -function decimalFromJSON (_type: any) { return new Decimal(_type['scale'], _type['precision']); } -function dateFromJSON (_type: any) { return new Date_(DateUnit[_type['unit']] as any); } -function timeFromJSON (_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } -function timestampFromJSON (_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } -function intervalFromJSON (_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } -function listFromJSON (_type: any, children: Field[]) { return new List(children); } -function structFromJSON (_type: any, children: Field[]) { return new Struct(children); } -function unionFromJSON (_type: any, children: Field[]) { return new Union(UnionMode[_type['mode']] as any, (_type['typeIds'] || []) as Type[], children); } -function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } -function fixedSizeListFromJSON (_type: any, children: Field[]) { return new FixedSizeList(_type['listSize'], children); } -function mapFromJSON (_type: any, children: Field[]) { return new Map_(_type['keysSorted'], children); } diff --git a/js/src/ipc/reader/node.ts b/js/src/ipc/reader/node.ts deleted file mode 100644 index 24295c81cbd52..0000000000000 --- a/js/src/ipc/reader/node.ts +++ /dev/null @@ -1,78 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
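The integration-test JSON format that the deleted `json.ts` reader parsed stores binary columns as hex strings (e.g. `"DATA": ["49BC7D5B6C47D2", ...]`). A standalone sketch of the pairwise decoding the removed `binaryDataFromJSON` performed:

```ts
// Decode hex-encoded JSON "DATA" values into one contiguous byte buffer.
function bytesFromHexStrings(values: string[]): Uint8Array {
    const joined = values.join('');
    const data = new Uint8Array(joined.length / 2);
    for (let i = 0; i < joined.length; i += 2) {
        data[i >> 1] = parseInt(joined.substr(i, 2), 16); // two hex chars per byte
    }
    return data;
}

// bytesFromHexStrings(['49BC', '7D']) -> Uint8Array [0x49, 0xBC, 0x7D]
```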
- -import { flatbuffers } from 'flatbuffers'; -import * as Message_ from '../../fb/Message'; -import ByteBuffer = flatbuffers.ByteBuffer; -import _Message = Message_.org.apache.arrow.flatbuf.Message; -import { PADDING, isValidArrowFile, checkForMagicArrowString } from '../magic'; - -export async function* fromReadableStream(stream: NodeJS.ReadableStream) { - - let bb: ByteBuffer; - let bytesRead = 0, bytes = new Uint8Array(0); - let messageLength = 0, message: _Message | null = null; - - for await (let chunk of (stream as any as AsyncIterable)) { - - if (chunk == null) { - continue; - } - - const grown = new Uint8Array(bytes.byteLength + chunk.length); - - if (typeof chunk !== 'string') { - grown.set(bytes, 0) || grown.set(chunk, bytes.byteLength); - } else { - for (let i = -1, j = bytes.byteLength, n = chunk.length; ++i < n;) { - grown[i + j] = chunk.charCodeAt(i); - } - } - - bytes = grown; - - // If we're reading in an Arrow File, just concatenate the bytes until - // the file is fully read in - if (checkForMagicArrowString(bytes)) { - if (!isValidArrowFile(new ByteBuffer(bytes))) { - continue; - } - return yield bytes; - } - - if (bytes.byteLength > 0 && messageLength <= 0) { - messageLength = new DataView(bytes.buffer).getInt32(0, true); - } - - while (messageLength > 0 && messageLength <= bytes.byteLength) { - if (!message) { - (bb = new ByteBuffer(bytes)).setPosition(4); - if (message = _Message.getRootAsMessage(bb)) { - messageLength += message.bodyLength().low; - continue; - } - throw new Error(`Invalid message at position ${bytesRead}`); - } - bytesRead += messageLength + PADDING; - yield bytes.subarray(0, messageLength + PADDING); - bytes = bytes.subarray(messageLength + PADDING); - messageLength = bytes.byteLength < 4 ? 0 : - new DataView(bytes.buffer).getInt32(bytes.byteOffset, true); - message = null; - } - } -} diff --git a/js/src/ipc/reader/vector.ts b/js/src/ipc/reader/vector.ts deleted file mode 100644 index c4688f5e2b851..0000000000000 --- a/js/src/ipc/reader/vector.ts +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
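The `fromReadableStream` generator deleted just above frames the IPC stream by hand: each message is preceded by a little-endian int32 metadata length, and the body bytes (whose length is recorded inside the decoded flatbuffer) follow. A simplified sketch of that prefix scan, with body skipping and chunk buffering elided:

```ts
const PADDING = 4; // width of the int32 length prefix

// Return the next metadata frame in `bytes` at `offset`, or null if the
// chunk is incomplete. A full reader (like the removed fromReadableStream)
// must then skip message.bodyLength() bytes before the next prefix.
function nextMetadataFrame(bytes: Uint8Array, offset: number): Uint8Array | null {
    if (offset + PADDING > bytes.byteLength) { return null; }
    const metadataLength = new DataView(bytes.buffer, bytes.byteOffset + offset).getInt32(0, true);
    if (metadataLength <= 0) { return null; } // zero length signals end-of-stream
    const end = offset + PADDING + metadataLength;
    return end <= bytes.byteLength ? bytes.subarray(offset + PADDING, end) : null;
}
```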
- -import { Vector } from '../../vector'; -import { RecordBatch } from '../../recordbatch'; -import { TypeVisitor } from '../../visitor'; -import { FlatType, NestedType, ListType } from '../../type'; -import { Message, FieldMetadata, BufferMetadata } from '../metadata'; -import { FlatData, ListData, NestedData, SingleNestedData, DenseUnionData, SparseUnionData, BoolData, FlatListData, DictionaryData } from '../../data'; -import { - Schema, Field, - Dictionary, - Null, Int, Float, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, - UnionMode, SparseUnion, DenseUnion, FlatListType, DataType, -} from '../../type'; - -export function* readRecordBatches(messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { - for (const { schema, message, loader } of messages) { - yield* readRecordBatch(schema, message, loader); - } -} - -export async function* readRecordBatchesAsync(messages: AsyncIterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { - for await (const { schema, message, loader } of messages) { - yield* readRecordBatch(schema, message, loader); - } -} - -export function* readRecordBatch(schema: Schema, message: Message, loader: TypeDataLoader) { - if (Message.isRecordBatch(message)) { - yield new RecordBatch(schema, message.length, loader.visitFields(schema.fields)); - } else if (Message.isDictionaryBatch(message)) { - const dictionaryId = message.id; - const dictionaries = loader.dictionaries; - const dictionaryField = schema.dictionaries.get(dictionaryId)!; - const dictionaryDataType = (dictionaryField.type as Dictionary).dictionary; - let dictionaryVector = Vector.create(loader.visit(dictionaryDataType)); - if (message.isDelta && dictionaries.has(dictionaryId)) { - dictionaryVector = dictionaries.get(dictionaryId)!.concat(dictionaryVector); - } - dictionaries.set(dictionaryId, dictionaryVector); - } -} - -export abstract class TypeDataLoader extends TypeVisitor { - - public dictionaries: Map; - protected nodes: Iterator; - protected buffers: Iterator; - - constructor(nodes: Iterator, buffers: Iterator, dictionaries: Map) { - super(); - this.nodes = nodes; - this.buffers = buffers; - this.dictionaries = dictionaries; - } - - public visitFields(fields: Field[]) { return fields.map((field) => this.visit(field.type)); } - - public visitNull (type: Null) { return this.visitNullType(type); } - public visitInt (type: Int) { return this.visitFlatType(type); } - public visitFloat (type: Float) { return this.visitFlatType(type); } - public visitBinary (type: Binary) { return this.visitFlatList(type); } - public visitUtf8 (type: Utf8) { return this.visitFlatList(type); } - public visitBool (type: Bool) { return this.visitBoolType(type); } - public visitDecimal (type: Decimal) { return this.visitFlatType(type); } - public visitDate (type: Date_) { return this.visitFlatType(type); } - public visitTime (type: Time) { return this.visitFlatType(type); } - public visitTimestamp (type: Timestamp) { return this.visitFlatType(type); } - public visitInterval (type: Interval) { return this.visitFlatType(type); } - public visitList (type: List) { return this.visitListType(type); } - public visitStruct (type: Struct) { return this.visitNestedType(type); } - public visitUnion (type: Union) { return this.visitUnionType(type); } - public visitFixedSizeBinary(type: FixedSizeBinary) { return this.visitFlatType(type); } - public visitFixedSizeList (type: FixedSizeList) { return 
this.visitFixedSizeListType(type); } - public visitMap (type: Map_) { return this.visitNestedType(type); } - public visitDictionary (type: Dictionary) { - return new DictionaryData(type, this.dictionaries.get(type.id)!, this.visit(type.indices)); - } - protected getFieldMetadata() { return this.nodes.next().value; } - protected getBufferMetadata() { return this.buffers.next().value; } - protected readNullBitmap(type: T, nullCount: number, buffer = this.getBufferMetadata()) { - return nullCount > 0 && this.readData(type, buffer) || new Uint8Array(0); - } - protected abstract readData(type: T, buffer?: BufferMetadata): any; - protected abstract readOffsets(type: T, buffer?: BufferMetadata): any; - protected abstract readTypeIds(type: T, buffer?: BufferMetadata): any; - protected visitNullType(type: Null, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new FlatData(type, length, this.readNullBitmap(type, nullCount), new Uint8Array(0), 0, nullCount); - } - protected visitFlatType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new FlatData(type, length, this.readNullBitmap(type, nullCount), this.readData(type), 0, nullCount); - } - protected visitBoolType(type: Bool, { length, nullCount }: FieldMetadata = this.getFieldMetadata(), data?: Uint8Array) { - return new BoolData(type, length, this.readNullBitmap(type, nullCount), data || this.readData(type), 0, nullCount); - } - protected visitFlatList(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new FlatListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type), 0, nullCount); - } - protected visitListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new ListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children![0].type), 0, nullCount); - } - protected visitFixedSizeListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new SingleNestedData(type, length, this.readNullBitmap(type, nullCount), this.visit(type.children![0].type), 0, nullCount); - } - protected visitNestedType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new NestedData(type, length, this.readNullBitmap(type, nullCount), this.visitFields(type.children), 0, nullCount); - } - protected visitUnionType(type: DenseUnion | SparseUnion, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return type.mode === UnionMode.Sparse ? - new SparseUnionData(type as SparseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount) : - new DenseUnionData(type as DenseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.readOffsets(type), this.visitFields(type.children), 0, nullCount); - } -} diff --git a/js/src/ipc/whatwg/iterable.ts b/js/src/ipc/whatwg/iterable.ts new file mode 100644 index 0000000000000..31916f2a3bdac --- /dev/null +++ b/js/src/ipc/whatwg/iterable.ts @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { toUint8Array } from '../../util/buffer'; +import { ReadableDOMStreamOptions } from '../../io/interfaces'; +import { isIterable, isAsyncIterable } from '../../util/compat'; + +/** @ignore */ +export function toDOMStream(source: Iterable | AsyncIterable, options?: ReadableDOMStreamOptions): ReadableStream { + if (isAsyncIterable(source)) { return asyncIterableAsReadableDOMStream(source, options); } + if (isIterable(source)) { return iterableAsReadableDOMStream(source, options); } + /* istanbul ignore next */ + throw new Error(`toDOMStream() must be called with an Iterable or AsyncIterable`); +} + +/** @ignore */ +function iterableAsReadableDOMStream(source: Iterable, options?: ReadableDOMStreamOptions) { + + let it: Iterator | null = null; + const bm = (options && options.type === 'bytes') || false; + const hwm = options && options.highWaterMark || (2 ** 24); + + return new ReadableStream({ + ...options as any, + start(controller) { next(controller, it || (it = source[Symbol.iterator]())); }, + pull(controller) { it ? (next(controller, it)) : controller.close(); }, + cancel() { (it && (it.return && it.return()) || true) && (it = null); } + }, { highWaterMark: bm ? hwm : undefined, ...options }); + + function next(controller: ReadableStreamDefaultController, it: Iterator) { + let buf: Uint8Array; + let r: IteratorResult | null = null; + let size = controller.desiredSize || null; + while (!(r = it.next(bm ? size : null)).done) { + if (ArrayBuffer.isView(r.value) && (buf = toUint8Array(r.value))) { + size != null && bm && (size = size - buf.byteLength + 1); + r.value = buf; + } + controller.enqueue(r.value); + if (size != null && --size <= 0) { return; } + } + controller.close(); + } +} + +/** @ignore */ +function asyncIterableAsReadableDOMStream(source: AsyncIterable, options?: ReadableDOMStreamOptions) { + + let it: AsyncIterator | null = null; + const bm = (options && options.type === 'bytes') || false; + const hwm = options && options.highWaterMark || (2 ** 24); + + return new ReadableStream({ + ...options as any, + async start(controller) { await next(controller, it || (it = source[Symbol.asyncIterator]())); }, + async pull(controller) { it ? (await next(controller, it)) : controller.close(); }, + async cancel() { (it && (it.return && await it.return()) || true) && (it = null); }, + }, { highWaterMark: bm ? hwm : undefined, ...options }); + + async function next(controller: ReadableStreamDefaultController, it: AsyncIterator) { + let buf: Uint8Array; + let r: IteratorResult | null = null; + let size = controller.desiredSize || null; + while (!(r = await it.next(bm ? 
size : null)).done) { + if (ArrayBuffer.isView(r.value) && (buf = toUint8Array(r.value))) { + size != null && bm && (size = size - buf.byteLength + 1); + r.value = buf; + } + controller.enqueue(r.value); + if (size != null && --size <= 0) { return; } + } + controller.close(); + } +} diff --git a/js/src/ipc/whatwg/reader.ts b/js/src/ipc/whatwg/reader.ts new file mode 100644 index 0000000000000..3e39900fe27e5 --- /dev/null +++ b/js/src/ipc/whatwg/reader.ts @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { DataType } from '../../type'; +import { RecordBatch } from '../../recordbatch'; +import { AsyncByteQueue } from '../../io/stream'; +import { RecordBatchReader } from '../../ipc/reader'; + +/** @ignore */ +export function recordBatchReaderThroughDOMStream(writableStrategy?: ByteLengthQueuingStrategy, readableStrategy?: { autoDestroy: boolean }) { + + const queue = new AsyncByteQueue(); + let reader: RecordBatchReader | null = null; + + const readable = new ReadableStream>({ + async cancel() { await queue.close(); }, + async start(controller) { await next(controller, reader || (reader = await open())); }, + async pull(controller) { reader ? await next(controller, reader) : controller.close(); } + }); + + return { writable: new WritableStream(queue, { 'highWaterMark': 2 ** 14, ...writableStrategy }), readable }; + + async function open() { + return await (await RecordBatchReader.from(queue)).open(readableStrategy); + } + + async function next(controller: ReadableStreamDefaultController>, reader: RecordBatchReader) { + let size = controller.desiredSize; + let r: IteratorResult> | null = null; + while (!(r = await reader.next()).done) { + controller.enqueue(r.value); + if (size != null && --size <= 0) { + return; + } + } + controller.close(); + } +} diff --git a/js/src/ipc/whatwg/writer.ts b/js/src/ipc/whatwg/writer.ts new file mode 100644 index 0000000000000..de3b3f1d2474a --- /dev/null +++ b/js/src/ipc/whatwg/writer.ts @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
diff --git a/js/src/ipc/whatwg/writer.ts b/js/src/ipc/whatwg/writer.ts
new file mode 100644
index 0000000000000..de3b3f1d2474a
--- /dev/null
+++ b/js/src/ipc/whatwg/writer.ts
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { DataType } from '../../type';
+import { RecordBatch } from '../../recordbatch';
+import { AsyncByteStream } from '../../io/stream';
+import { RecordBatchWriter } from '../../ipc/writer';
+
+/** @ignore */
+export function recordBatchWriterThroughDOMStream<T extends { [key: string]: DataType } = any>(
+    this: typeof RecordBatchWriter,
+    writableStrategy?: QueuingStrategy<RecordBatch<T>> & { autoDestroy: boolean },
+    readableStrategy?: { highWaterMark?: number, size?: any }
+) {
+
+    const writer = new this<T>(writableStrategy);
+    const reader = new AsyncByteStream(writer);
+    const readable = new ReadableStream({
+        type: 'bytes',
+        async cancel() { await reader.cancel(); },
+        async pull(controller) { await next(controller); },
+        async start(controller) { await next(controller); },
+    }, { 'highWaterMark': 2 ** 14, ...readableStrategy });
+
+    return { writable: new WritableStream(writer, writableStrategy), readable };
+
+    async function next(controller: ReadableStreamDefaultController<Uint8Array>) {
+        let buf: Uint8Array | null = null;
+        let size = controller.desiredSize;
+        while (buf = await reader.read(size || null)) {
+            controller.enqueue(buf);
+            if (size != null && (size -= buf.byteLength) <= 0) { return; }
+        }
+        controller.close();
+    }
+}
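The writer side can be sketched the same way, assuming the function above backs the RecordBatchWriter.throughDOM() static (whose environment-agnostic stub appears in js/src/ipc/writer.ts below); the batch source and byte sink here are hypothetical:

    import { RecordBatch, RecordBatchStreamWriter } from 'apache-arrow';

    declare const batches: ReadableStream<RecordBatch>;  // e.g. the readable side of a reader transform
    declare const sink: WritableStream<Uint8Array>;      // any byte sink (hypothetical)

    // RecordBatches in, Arrow IPC stream-format bytes out
    await batches
        .pipeThrough(RecordBatchStreamWriter.throughDOM())
        .pipeTo(sink);
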
diff --git a/js/src/ipc/writer.ts b/js/src/ipc/writer.ts
new file mode 100644
index 0000000000000..746e5ef58e369
--- /dev/null
+++ b/js/src/ipc/writer.ts
@@ -0,0 +1,417 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Table } from '../table';
+import { MAGIC } from './message';
+import { Vector } from '../vector';
+import { Column } from '../column';
+import { Schema, Field } from '../schema';
+import { Chunked } from '../vector/chunked';
+import { Message } from './metadata/message';
+import { RecordBatch } from '../recordbatch';
+import * as metadata from './metadata/message';
+import { DataType, Dictionary } from '../type';
+import { FileBlock, Footer } from './metadata/file';
+import { MessageHeader, MetadataVersion } from '../enum';
+import { WritableSink, AsyncByteQueue } from '../io/stream';
+import { VectorAssembler } from '../visitor/vectorassembler';
+import { JSONTypeAssembler } from '../visitor/jsontypeassembler';
+import { JSONVectorAssembler } from '../visitor/jsonvectorassembler';
+import { ArrayBufferViewInput, toUint8Array } from '../util/buffer';
+import { Writable, ReadableInterop, ReadableDOMStreamOptions } from '../io/interfaces';
+import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream } from '../util/compat';
+
+export class RecordBatchWriter<T extends { [key: string]: DataType } = any> extends ReadableInterop<Uint8Array> implements Writable<RecordBatch<T>> {
+
+    /** @nocollapse */
+    // @ts-ignore
+    public static throughNode(options?: import('stream').DuplexOptions & { autoDestroy: boolean }): import('stream').Duplex {
+        throw new Error(`"throughNode" not available in this environment`);
+    }
+    /** @nocollapse */
+    public static throughDOM<T extends { [key: string]: DataType }>(
+        // @ts-ignore
+        writableStrategy?: QueuingStrategy<RecordBatch<T>> & { autoDestroy: boolean },
+        // @ts-ignore
+        readableStrategy?: { highWaterMark?: number, size?: any }
+    ): { writable: WritableStream<Table<T> | RecordBatch<T>>, readable: ReadableStream<Uint8Array> } {
+        throw new Error(`"throughDOM" not available in this environment`);
+    }
+
+    constructor(options?: { autoDestroy: boolean }) {
+        super();
+        this._autoDestroy = options && (typeof options.autoDestroy === 'boolean') ? options.autoDestroy : true;
+    }
+
+    protected _position = 0;
+    protected _started = false;
+    protected _autoDestroy: boolean;
+    // @ts-ignore
+    protected _sink = new AsyncByteQueue();
+    protected _schema: Schema<T> | null = null;
+    protected _dictionaryBlocks: FileBlock[] = [];
+    protected _recordBatchBlocks: FileBlock[] = [];
+
+    public toString(sync: true): string;
+    public toString(sync?: false): Promise<string>;
+    public toString(sync: any = false) {
+        return this._sink.toString(sync) as Promise<string> | string;
+    }
+    public toUint8Array(sync: true): Uint8Array;
+    public toUint8Array(sync?: false): Promise<Uint8Array>;
+    public toUint8Array(sync: any = false) {
+        return this._sink.toUint8Array(sync) as Promise<Uint8Array> | Uint8Array;
+    }
+
+    public writeAll(input: Table<T> | Iterable<RecordBatch<T>>): this;
+    public writeAll(input: AsyncIterable<RecordBatch<T>>): Promise<this>;
+    public writeAll(input: PromiseLike<AsyncIterable<RecordBatch<T>>>): Promise<this>;
+    public writeAll(input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>): Promise<this>;
+    public writeAll(input: PromiseLike<any> | Table<T> | Iterable<RecordBatch<T>> | AsyncIterable<RecordBatch<T>>) {
+        if (isPromise(input)) {
+            return input.then((x) => this.writeAll(x));
+        } else if (isAsyncIterable<RecordBatch<T>>(input)) {
+            return writeAllAsync(this, input);
+        }
+        return writeAll(this, input);
+    }
+
+    public get closed() { return this._sink.closed; }
+    public [Symbol.asyncIterator]() { return this._sink[Symbol.asyncIterator](); }
+    public toDOMStream(options?: ReadableDOMStreamOptions) { return this._sink.toDOMStream(options); }
+    public toNodeStream(options?: import('stream').ReadableOptions) { return this._sink.toNodeStream(options); }
+
+    public close() {
+        return this.reset()._sink.close();
+    }
+    public abort(reason?: any) {
+        return this.reset()._sink.abort(reason);
+    }
+    public finish() {
+        this._autoDestroy ? this.close() : this.reset(this._sink, this._schema);
+        return this;
+    }
+    public reset(sink: WritableSink<ArrayBufferViewInput> = this._sink, schema: Schema<T> | null = null) {
+
+        if ((sink === this._sink) || (sink instanceof AsyncByteQueue)) {
+            this._sink = sink as AsyncByteQueue;
+        } else {
+            this._sink = new AsyncByteQueue();
+            if (sink && isWritableDOMStream(sink)) {
+                this.toDOMStream({ type: 'bytes' }).pipeTo(sink);
+            } else if (sink && isWritableNodeStream(sink)) {
+                this.toNodeStream({ objectMode: false }).pipe(sink);
+            }
+        }
+
+        if (this._started && this._schema) {
+            this._writeFooter();
+        }
+
+        this._started = false;
+        this._dictionaryBlocks = [];
+        this._recordBatchBlocks = [];
+
+        if (!schema || (schema !== this._schema)) {
+            if (schema === null) {
+                this._position = 0;
+                this._schema = null;
+            } else {
+                this._started = true;
+                this._schema = schema;
+                this._writeSchema(schema);
+            }
+        }
+
+        return this;
+    }
+
+    public write(chunk?: Table<T> | RecordBatch<T> | null) {
+        let schema: Schema<T> | null;
+        if (!this._sink) {
+            throw new Error(`RecordBatchWriter is closed`);
+        } else if (!chunk || !(schema = chunk.schema)) {
+            return this.finish() && undefined;
+        } else if (schema !== this._schema) {
+            if (this._started && this._autoDestroy) {
+                return this.close();
+            }
+            this.reset(this._sink, schema);
+        }
+        (chunk instanceof Table)
+            ? this.writeAll(chunk.chunks)
+            : this._writeRecordBatch(chunk);
+    }
+
+    protected _writeMessage(message: Message, alignment = 8) {
+
+        const a = alignment - 1;
+        const buffer = Message.encode(message);
+        const flatbufferSize = buffer.byteLength;
+        const alignedSize = (flatbufferSize + 4 + a) & ~a;
+        const nPaddingBytes = alignedSize - flatbufferSize - 4;
+
+        if (message.headerType === MessageHeader.RecordBatch) {
+            this._recordBatchBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position));
+        } else if (message.headerType === MessageHeader.DictionaryBatch) {
+            this._dictionaryBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position));
+        }
+
+        // Write the flatbuffer size prefix including padding
+        this._write(Int32Array.of(alignedSize - 4));
+        // Write the flatbuffer
+        if (flatbufferSize > 0) { this._write(buffer); }
+        // Write any padding
+        return this._writePadding(nPaddingBytes);
+    }
+
+    protected _write(chunk: ArrayBufferViewInput) {
+        if (this._started) {
+            const buffer = toUint8Array(chunk);
+            if (buffer && buffer.byteLength > 0) {
+                this._sink.write(buffer);
+                this._position += buffer.byteLength;
+            }
+        }
+        return this;
+    }
+
+    protected _writeSchema(schema: Schema<T>) {
+        return this
+            ._writeMessage(Message.from(schema))
+            ._writeDictionaries(schema.dictionaryFields);
+    }
+
+    protected _writeFooter() {
+        return this._writePadding(4); // eos bytes
+    }
+
+    protected _writeMagic() {
+        return this._write(MAGIC);
+    }
+
+    protected _writePadding(nBytes: number) {
+        return nBytes > 0 ? this._write(new Uint8Array(nBytes)) : this;
+    }
+
+    protected _writeRecordBatch(records: RecordBatch<T>) {
+        const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(records);
+        const recordBatch = new metadata.RecordBatch(records.length, nodes, bufferRegions);
+        const message = Message.from(recordBatch, byteLength);
+        return this
+            ._writeMessage(message)
+            ._writeBodyBuffers(buffers);
+    }
+
+    protected _writeDictionaryBatch(dictionary: Vector, id: number, isDelta = false) {
+        const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(dictionary);
+        const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions);
+        const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta);
+        const message = Message.from(dictionaryBatch, byteLength);
+        return this
+            ._writeMessage(message)
+            ._writeBodyBuffers(buffers);
+    }
+
+    protected _writeBodyBuffers(buffers: ArrayBufferView[]) {
+        let buffer: ArrayBufferView;
+        let size: number, padding: number;
+        for (let i = -1, n = buffers.length; ++i < n;) {
+            if ((buffer = buffers[i]) && (size = buffer.byteLength) > 0) {
+                this._write(buffer);
+                if ((padding = ((size + 7) & ~7) - size) > 0) {
+                    this._writePadding(padding);
+                }
+            }
+        }
+        return this;
+    }
+
+    protected _writeDictionaries(dictionaryFields: Map<number, Field<Dictionary<any, any>>[]>) {
+        for (const [id, fields] of dictionaryFields) {
+            const vector = fields[0].type.dictionaryVector;
+            if (!(vector instanceof Chunked)) {
+                this._writeDictionaryBatch(vector, id, false);
+            } else {
+                const chunks = vector.chunks;
+                for (let i = -1, n = chunks.length; ++i < n;) {
+                    this._writeDictionaryBatch(chunks[i], id, i > 0);
+                }
+            }
+        }
+        return this;
+    }
+}
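Worth noting before the subclasses: finish() above either closes the writer (autoDestroy, the default) or rewinds it via reset(), which writes the end-of-stream footer so one sink can carry several logical IPC streams. A sketch of that second mode (the tables are hypothetical and assumed to have different schemas):

    import { Table, RecordBatchStreamWriter } from 'apache-arrow';

    declare const tableA: Table;
    declare const tableB: Table;

    // With autoDestroy: false, finish() resets instead of closing, so
    // consecutive IPC streams can share one underlying byte stream.
    const writer = new RecordBatchStreamWriter({ autoDestroy: false });
    writer.writeAll(tableA);   // schema A + batches; finish() writes EOS and resets
    writer.writeAll(tableB);   // schema B + batches on the same sink
    writer.close();            // writes EOS for B, then closes the sink

    const bytes = await writer.toUint8Array();
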
+
+/** @ignore */
+export class RecordBatchStreamWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> {
+
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: Table<T> | Iterable<RecordBatch<T>>, options?: { autoDestroy: true }): RecordBatchStreamWriter<T>;
+    // @ts-ignore
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: AsyncIterable<RecordBatch<T>>, options?: { autoDestroy: true }): Promise<RecordBatchStreamWriter<T>>;
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<AsyncIterable<RecordBatch<T>>>, options?: { autoDestroy: true }): Promise<RecordBatchStreamWriter<T>>;
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>, options?: { autoDestroy: true }): Promise<RecordBatchStreamWriter<T>>;
+    /** @nocollapse */
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: any, options?: { autoDestroy: true }) {
+        return new RecordBatchStreamWriter<T>(options).writeAll(input);
+    }
+}
+
+/** @ignore */
+export class RecordBatchFileWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> {
+
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: Table<T> | Iterable<RecordBatch<T>>): RecordBatchFileWriter<T>;
+    // @ts-ignore
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: AsyncIterable<RecordBatch<T>>): Promise<RecordBatchFileWriter<T>>;
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<AsyncIterable<RecordBatch<T>>>): Promise<RecordBatchFileWriter<T>>;
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>): Promise<RecordBatchFileWriter<T>>;
+    /** @nocollapse */
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: any) {
+        return new RecordBatchFileWriter<T>().writeAll(input);
+    }
+
+    constructor() {
+        super();
+        this._autoDestroy = true;
+    }
+
+    protected _writeSchema(schema: Schema<T>) {
+        return this
+            ._writeMagic()._writePadding(2)
+            ._writeDictionaries(schema.dictionaryFields);
+    }
+
+    protected _writeFooter() {
+        const buffer = Footer.encode(new Footer(
+            this._schema!, MetadataVersion.V4,
+            this._recordBatchBlocks, this._dictionaryBlocks
+        ));
+        return this
+            ._write(buffer)                            // Write the flatbuffer
+            ._write(Int32Array.of(buffer.byteLength))  // then the footer size suffix
+            ._writeMagic();                            // then the magic suffix
+    }
+}
+
+/** @ignore */
+export class RecordBatchJSONWriter<T extends { [key: string]: DataType } = any> extends RecordBatchWriter<T> {
+
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: Table<T> | Iterable<RecordBatch<T>>): RecordBatchJSONWriter<T>;
+    // @ts-ignore
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: AsyncIterable<RecordBatch<T>>): Promise<RecordBatchJSONWriter<T>>;
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<AsyncIterable<RecordBatch<T>>>): Promise<RecordBatchJSONWriter<T>>;
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: PromiseLike<Table<T> | Iterable<RecordBatch<T>>>): Promise<RecordBatchJSONWriter<T>>;
+    /** @nocollapse */
+    public static writeAll<T extends { [key: string]: DataType } = any>(this: typeof RecordBatchWriter, input: any) {
+        return new RecordBatchJSONWriter<T>().writeAll(input as any);
+    }
+
+    constructor() {
+        super();
+        this._autoDestroy = true;
+    }
+
+    protected _writeMessage() { return this; }
+    protected _writeSchema(schema: Schema<T>) {
+        return this._write(`{\n  "schema": ${
+            JSON.stringify({ fields: schema.fields.map(fieldToJSON) }, null, 2)
+        }`)._writeDictionaries(schema.dictionaryFields);
+    }
+    protected _writeDictionaries(dictionaryFields: Map<number, Field<Dictionary<any, any>>[]>) {
+        this._write(`,\n  "dictionaries": [\n`);
+        super._writeDictionaries(dictionaryFields);
+        return this._write(`\n  ]`);
+    }
+    protected _writeDictionaryBatch(dictionary: Vector, id: number, isDelta = false) {
+        this._write(this._dictionaryBlocks.length === 0 ? `    ` : `,\n    `);
+        this._write(`${dictionaryBatchToJSON(this._schema!, dictionary, id, isDelta)}`);
+        this._dictionaryBlocks.push(new FileBlock(0, 0, 0));
+        return this;
+    }
+    protected _writeRecordBatch(records: RecordBatch<T>) {
+        this._write(this._recordBatchBlocks.length === 0
+            ? `,\n  "batches": [\n    `
+            : `,\n    `);
+        this._write(`${recordBatchToJSON(records)}`);
+        this._recordBatchBlocks.push(new FileBlock(0, 0, 0));
+        return this;
+    }
+    public close() {
+        if (this._recordBatchBlocks.length > 0) {
+            this._write(`\n  ]`);
+        }
+        if (this._schema) {
+            this._write(`\n}`);
+        }
+        return super.close();
+    }
+}
+
+/** @ignore */
+function writeAll<T extends { [key: string]: DataType } = any>(writer: RecordBatchWriter<T>, input: Table<T> | Iterable<RecordBatch<T>>) {
+    const chunks = (input instanceof Table) ? input.chunks : input;
+    for (const batch of chunks) {
+        writer.write(batch);
+    }
+    return writer.finish();
+}
+
+/** @ignore */
+async function writeAllAsync<T extends { [key: string]: DataType } = any>(writer: RecordBatchWriter<T>, batches: AsyncIterable<RecordBatch<T>>) {
+    for await (const batch of batches) {
+        writer.write(batch);
+    }
+    return writer.finish();
+}
+
+/** @ignore */
+function fieldToJSON({ name, type, nullable }: Field): object {
+    const assembler = new JSONTypeAssembler();
+    return {
+        'name': name, 'nullable': nullable,
+        'type': assembler.visit(type),
+        'children': (type.children || []).map(fieldToJSON),
+        'dictionary': !DataType.isDictionary(type) ? undefined : {
+            'id': type.id,
+            'isOrdered': type.isOrdered,
+            'indexType': assembler.visit(type.indices)
+        }
+    };
+}
+
+/** @ignore */
+function dictionaryBatchToJSON(schema: Schema, dictionary: Vector, id: number, isDelta = false) {
+    const f = schema.dictionaryFields.get(id)![0];
+    const field = new Field(f.name, f.type.dictionary, f.nullable, f.metadata);
+    const columns = JSONVectorAssembler.assemble(new Column(field, [dictionary]));
+    return JSON.stringify({
+        'id': id,
+        'isDelta': isDelta,
+        'data': {
+            'count': dictionary.length,
+            'columns': columns
+        }
+    }, null, 2);
+}
+
+/** @ignore */
+function recordBatchToJSON(records: RecordBatch) {
+    return JSON.stringify({
+        'count': records.length,
+        'columns': JSONVectorAssembler.assemble(records)
+    }, null, 2);
+}
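Taken together, these writers supersede the generator-based serializers deleted in the next hunk. The everyday entry point is the static writeAll(); a sketch (the Table is assumed to already exist):

    import { Table, RecordBatchFileWriter, RecordBatchStreamWriter } from 'apache-arrow';

    declare const table: Table;

    // IPC file format: MAGIC header/suffix plus a footer of block offsets
    const fileBytes = await RecordBatchFileWriter.writeAll(table).toUint8Array();

    // IPC stream format: length-prefixed messages only, no footer
    const streamBytes = await RecordBatchStreamWriter.writeAll(table).toUint8Array();
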
diff --git a/js/src/ipc/writer/binary.ts b/js/src/ipc/writer/binary.ts
deleted file mode 100644
index df7c586d94ab5..0000000000000
--- a/js/src/ipc/writer/binary.ts
+++ /dev/null
@@ -1,725 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-import { Table } from '../../table';
-import { DenseUnionData } from '../../data';
-import { RecordBatch } from '../../recordbatch';
-import { VectorVisitor, TypeVisitor } from '../../visitor';
-import { MAGIC, magicLength, magicAndPadding, PADDING } from '../magic';
-import { align, getBool, packBools, iterateBits } from '../../util/bit';
-import { Vector, UnionVector, DictionaryVector, NestedVector, ListVector } from '../../vector';
-import { BufferMetadata, FieldMetadata, Footer, FileBlock, Message, RecordBatchMetadata, DictionaryBatch } from '../metadata';
-import {
-    Schema, Field, TypedArray, MetadataVersion,
-    DataType,
-    Dictionary,
-    Null, Int, Float,
-    Binary, Bool, Utf8, Decimal,
-    Date_, Time, Timestamp, Interval,
-    List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_,
-    FlatType, FlatListType, NestedType, UnionMode, SparseUnion, DenseUnion, SingleNestedType,
-} from '../../type';
-
-export function* serializeStream(table: Table) {
-    yield serializeMessage(table.schema).buffer;
-    for (const [id, field] of table.schema.dictionaries) {
-        const vec = table.getColumn(field.name) as any as DictionaryVector;
-        if (vec && vec.dictionary) {
-            yield serializeDictionaryBatch(vec.dictionary, id).buffer;
-        }
-    }
-    for (const recordBatch of table.batches) {
-        yield serializeRecordBatch(recordBatch).buffer;
-    }
-}
-
-export function* serializeFile(table: Table) {
-
-    const recordBatches = [];
-    const dictionaryBatches = [];
-
-    // First yield the magic string (aligned)
-    let buffer = new Uint8Array(align(magicLength, 8));
-    let metadataLength, bodyLength, byteLength = buffer.byteLength;
-    buffer.set(MAGIC, 0);
-    yield buffer;
-
-    // Then yield the schema
-    ({ metadataLength, buffer } = serializeMessage(table.schema));
-    byteLength += buffer.byteLength;
-    yield buffer;
-
-    for (const [id, field] of table.schema.dictionaries) {
-        const vec = table.getColumn(field.name) as any as DictionaryVector;
-        if (vec && vec.dictionary) {
-            ({ metadataLength, bodyLength, buffer } = serializeDictionaryBatch(vec.dictionary, id));
-            dictionaryBatches.push(new FileBlock(metadataLength, bodyLength, byteLength));
-            byteLength += buffer.byteLength;
-            yield buffer;
-        }
-    }
-    for (const recordBatch of table.batches) {
-        ({ metadataLength, bodyLength, buffer } = serializeRecordBatch(recordBatch));
-        recordBatches.push(new FileBlock(metadataLength, bodyLength, byteLength));
-        byteLength += buffer.byteLength;
-        yield buffer;
-    }
-
-    // Then yield the footer metadata (not aligned)
-    ({ metadataLength, buffer } = serializeFooter(new Footer(dictionaryBatches, recordBatches, table.schema)));
-    yield buffer;
-
-    // Last, yield the footer length + terminating magic arrow string (aligned)
-    buffer = new Uint8Array(magicAndPadding);
-    new DataView(buffer.buffer).setInt32(0, metadataLength, platformIsLittleEndian);
-    buffer.set(MAGIC, buffer.byteLength - magicLength);
-    yield buffer;
-}
-
-export function serializeRecordBatch(recordBatch: RecordBatch) {
-    const { byteLength, fieldNodes, buffers, buffersMeta } = new RecordBatchSerializer().visitRecordBatch(recordBatch);
-    const rbMeta = new RecordBatchMetadata(MetadataVersion.V4, recordBatch.length, fieldNodes, buffersMeta);
-    const rbData = concatBuffersWithMetadata(byteLength, buffers, buffersMeta);
-    return serializeMessage(rbMeta, rbData);
-}
-
-export function serializeDictionaryBatch(dictionary: Vector, id: Long | number, isDelta: boolean = false) {
-    const { byteLength, fieldNodes, buffers, buffersMeta } = new RecordBatchSerializer().visitRecordBatch(RecordBatch.from([dictionary]));
-    const rbMeta = new RecordBatchMetadata(MetadataVersion.V4, dictionary.length, fieldNodes, buffersMeta);
-    const dbMeta = new DictionaryBatch(MetadataVersion.V4, rbMeta, id, isDelta);
-    const rbData = concatBuffersWithMetadata(byteLength, buffers, buffersMeta);
-    return serializeMessage(dbMeta, rbData);
-}
-
-export function serializeMessage(message: Message, data?: Uint8Array) {
-    const b = new Builder();
-    _Message.finishMessageBuffer(b, writeMessage(b, message));
-    // Slice out the buffer that contains the message metadata
-    const metadataBytes = b.asUint8Array();
-    // Reserve 4 bytes for writing the message size at the front.
-    // Metadata length includes the metadata byteLength + the 4
-    // bytes for the length, and rounded up to the nearest 8 bytes.
-    const metadataLength = align(PADDING + metadataBytes.byteLength, 8);
-    // + the length of the optional data buffer at the end, padded
-    const dataByteLength = data ? data.byteLength : 0;
-    // ensure the entire message is aligned to an 8-byte boundary
-    const messageBytes = new Uint8Array(align(metadataLength + dataByteLength, 8));
-    // Write the metadata length into the first 4 bytes, but subtract the
-    // bytes we use to hold the length itself.
-    new DataView(messageBytes.buffer).setInt32(0, metadataLength - PADDING, platformIsLittleEndian);
-    // Copy the metadata bytes into the message buffer
-    messageBytes.set(metadataBytes, PADDING);
-    // Copy the optional data buffer after the metadata bytes
-    (data && dataByteLength > 0) && messageBytes.set(data, metadataLength);
-    // if (messageBytes.byteLength % 8 !== 0) { debugger; }
-    // Return the metadata length because we need to write it into each FileBlock also
-    return { metadataLength, bodyLength: message.bodyLength, buffer: messageBytes };
-}
-
-export function serializeFooter(footer: Footer) {
-    const b = new Builder();
-    _Footer.finishFooterBuffer(b, writeFooter(b, footer));
-    // Slice out the buffer that contains the footer metadata
-    const footerBytes = b.asUint8Array();
-    const metadataLength = footerBytes.byteLength;
-    return { metadataLength, buffer: footerBytes };
-}
-
-export class RecordBatchSerializer extends VectorVisitor {
-    public byteLength = 0;
-    public buffers: TypedArray[] = [];
-    public fieldNodes: FieldMetadata[] = [];
-    public buffersMeta: BufferMetadata[] = [];
-    public visitRecordBatch(recordBatch: RecordBatch) {
-        this.buffers = [];
-        this.byteLength = 0;
-        this.fieldNodes = [];
-        this.buffersMeta = [];
-        for (let vector: Vector, index = -1, numCols = recordBatch.numCols; ++index < numCols;) {
-            if (vector = recordBatch.getChildAt(index)!) {
-                this.visit(vector);
-            }
-        }
-        return this;
-    }
-    public visit<T extends DataType>(vector: Vector<T>) {
-        if (!DataType.isDictionary(vector.type)) {
-            const { data, length, nullCount } = vector;
-            if (length > 2147483647) {
-                throw new RangeError('Cannot write arrays larger than 2^31 - 1 in length');
-            }
-            this.fieldNodes.push(new FieldMetadata(length, nullCount));
-            this.addBuffer(nullCount <= 0
-                ? new Uint8Array(0) // placeholder validity buffer
-                : this.getTruncatedBitmap(data.offset, length, data.nullBitmap!)
-            );
-        }
-        return super.visit(vector);
-    }
-    public visitNull           (_nullz: Vector<Null>)      { return this; }
-    public visitBool           (vector: Vector<Bool>)      { return this.visitBoolVector(vector); }
-    public visitInt            (vector: Vector<Int>)       { return this.visitFlatVector(vector); }
-    public visitFloat          (vector: Vector<Float>)     { return this.visitFlatVector(vector); }
-    public visitUtf8           (vector: Vector<Utf8>)      { return this.visitFlatListVector(vector); }
-    public visitBinary         (vector: Vector<Binary>)    { return this.visitFlatListVector(vector); }
-    public visitDate           (vector: Vector<Date_>)     { return this.visitFlatVector(vector); }
-    public visitTimestamp      (vector: Vector<Timestamp>) { return this.visitFlatVector(vector); }
-    public visitTime           (vector: Vector