diff --git a/.env b/.env index 4f382986dabda..aa9e9c7667078 100644 --- a/.env +++ b/.env @@ -27,6 +27,12 @@ # the cache plugin functional DOCKER_VOLUME_PREFIX= +# turn on inline build cache, this is a docker buildx feature documented +# at https://github.com/docker/buildx#--cache-tonametypetypekeyvalue +COMPOSE_DOCKER_CLI_BUILD=1 +DOCKER_BUILDKIT=1 +BUILDKIT_INLINE_CACHE=1 + ULIMIT_CORE=-1 REPO=apache/arrow-dev ARCH=amd64 diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 48f74f7cd88c0..5f25deb45126f 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -38,10 +38,8 @@ on: - 'format/Flight.proto' env: - DOCKER_BUILDKIT: 0 - DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARROW_ENABLE_TIMING_TESTS: OFF + DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -72,7 +70,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }} @@ -367,7 +365,7 @@ jobs: run: | ci/scripts/msys2_setup.sh cpp - name: Cache ccache - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: ccache key: cpp-ccache-mingw${{ matrix.mingw-n-bits }}-${{ hashFiles('cpp/**') }} diff --git a/.github/workflows/cpp_cron.yml b/.github/workflows/cpp_cron.yml index 5cd692b126c90..c229ad93be306 100644 --- a/.github/workflows/cpp_cron.yml +++ b/.github/workflows/cpp_cron.yml @@ -30,10 +30,8 @@ on: 0 */12 * * * env: - DOCKER_BUILDKIT: 0 - DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARROW_ENABLE_TIMING_TESTS: OFF + DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -83,7 +81,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: 
actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ${{ matrix.name }}-${{ hashFiles('cpp/**') }} diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 59740f3c3d8a4..37016efcbfe2b 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -23,8 +23,6 @@ on: pull_request: env: - DOCKER_BUILDKIT: 0 - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 663a9dbb48b31..60a96081a1908 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -33,8 +33,6 @@ on: - 'go/**' env: - DOCKER_BUILDKIT: 0 - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 7cf2ddfaa2b85..20112553ea25d 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -44,9 +44,7 @@ on: - 'rust/**' env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -66,7 +64,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: conda-${{ hashFiles('cpp/**') }} diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 38f58847df069..7f6f29f0f4440 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -36,9 +36,7 @@ on: - 'java/**' env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -68,7 +66,7 @@ jobs: shell: bash run: ci/scripts/util_cleanup.sh - name: Cache Docker 
Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: maven-${{ hashFiles('java/**') }} diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 79ba50ef6e8c2..5f25e8c053d8c 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -36,9 +36,7 @@ on: - 'java/**' env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -66,7 +64,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: maven-${{ hashFiles('java/**') }} diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index e84a812a07830..c021e99c4051a 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -32,8 +32,6 @@ on: - 'js/**' env: - DOCKER_BUILDKIT: 0 - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 254d94942331f..9062e93e66515 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -32,9 +32,7 @@ on: - 'python/**' env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -84,7 +82,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} diff --git a/.github/workflows/python_cron.yml b/.github/workflows/python_cron.yml index 88007bac2b94d..7a4401af1c3bf 100644 --- a/.github/workflows/python_cron.yml +++ b/.github/workflows/python_cron.yml @@ -29,9 +29,7 @@ 
on: 0 */12 * * * env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -121,7 +119,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 066c274446f1c..8869de77b347f 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -40,9 +40,7 @@ on: - 'r/**' env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -69,7 +67,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/**') }} @@ -117,7 +115,7 @@ jobs: - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ${{ matrix.config.image }}-r-${{ hashFiles('cpp/**') }} diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 6ade7d853b5f4..d9430f536b2df 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -44,9 +44,7 @@ on: - 'ruby/**' env: - DOCKER_BUILDKIT: 0 DOCKER_VOLUME_PREFIX: ".docker/" - COMPOSE_DOCKER_CLI_BUILD: 1 ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -76,7 +74,7 @@ jobs: shell: bash run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: .docker key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} diff --git 
a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index c3082271ca562..17100e76a3c12 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -1592,6 +1592,7 @@ garrow_csv_reader_new(GArrowInputStream *input, auto arrow_reader = arrow::csv::TableReader::Make(arrow::default_memory_pool(), + arrow::io::AsyncContext(), arrow_input, read_options, parse_options, diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile b/ci/docker/python-wheel-manylinux-201x.dockerfile index 4be0c97a66bc6..2bdb7a926cdb8 100644 --- a/ci/docker/python-wheel-manylinux-201x.dockerfile +++ b/ci/docker/python-wheel-manylinux-201x.dockerfile @@ -62,7 +62,8 @@ ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_FORCE_SYSTEM_BINARIES=1 \ VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \ - VCPKG_DEFAULT_TRIPLET=x64-linux-static-${build_type} + VCPKG_DEFAULT_TRIPLET=x64-linux-static-${build_type} \ + VCPKG_FEATURE_FLAGS=-manifests # TODO(kszucs): factor out the package enumeration to a text file and reuse it # from the windows image and potentially in a future macos wheel build diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index ecd58b4462de5..c0b85d4793868 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -46,7 +46,8 @@ COPY ci/vcpkg arrow/ci/vcpkg ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_OVERLAY_TRIPLETS=C:\\arrow\\ci\\vcpkg \ - VCPKG_DEFAULT_TRIPLET=x64-windows-static-md-${build_type} + VCPKG_DEFAULT_TRIPLET=x64-windows-static-md-${build_type} \ + VCPKG_FEATURE_FLAGS=-manifests RUN vcpkg install --clean-after-build \ abseil \ aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] \ diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 4306f644082aa..1d9e41bba7a60 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -25,6 +25,8 @@ arch=("any") 
url="https://arrow.apache.org/" license=("Apache-2.0") depends=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp" + "${MINGW_PACKAGE_PREFIX}-libutf8proc" + "${MINGW_PACKAGE_PREFIX}-re2" "${MINGW_PACKAGE_PREFIX}-thrift" "${MINGW_PACKAGE_PREFIX}-snappy" "${MINGW_PACKAGE_PREFIX}-zlib" @@ -103,9 +105,7 @@ build() { -DARROW_SNAPPY_USE_SHARED=OFF \ -DARROW_USE_GLOG=OFF \ -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_RE2=OFF \ -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_UTF8PROC=OFF \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ -DARROW_ZSTD_USE_SHARED=OFF \ diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index d7e239b7c0780..1bf0a3b889406 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -86,6 +86,7 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then ${binary_output_dir}/arrow-ipc-stream-fuzz ${ARROW_TEST_DATA}/arrow-ipc-stream/crash-* ${binary_output_dir}/arrow-ipc-stream-fuzz ${ARROW_TEST_DATA}/arrow-ipc-stream/*-testcase-* ${binary_output_dir}/arrow-ipc-file-fuzz ${ARROW_TEST_DATA}/arrow-ipc-file/*-testcase-* + ${binary_output_dir}/arrow-ipc-tensor-stream-fuzz ${ARROW_TEST_DATA}/arrow-ipc-tensor-stream/*-testcase-* if [ "${ARROW_PARQUET}" == "ON" ]; then ${binary_output_dir}/parquet-arrow-fuzz ${ARROW_TEST_DATA}/parquet/fuzzing/*-testcase-* fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index abd53759d8345..aa23e5b7c1858 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -30,4 +30,6 @@ pip install -e $arrow_dir/dev/archery archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/0.14.1 \ --gold-dirs=$gold_dir/0.17.1 \ + --gold-dirs=$gold_dir/1.0.0-bigendian \ + --gold-dirs=$gold_dir/1.0.0-littleendian \ --gold-dirs=$gold_dir/2.0.0-compression \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 68e75c39abf03..0a52415a0b9d2 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -67,6 
+67,7 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_WITH_ZSTD:=ON} : ${CMAKE_BUILD_TYPE:=release} : ${CMAKE_GENERATOR:=Ninja} +: ${VCPKG_FEATURE_FLAGS:=-manifests} mkdir /tmp/arrow-build pushd /tmp/arrow-build @@ -106,6 +107,8 @@ cmake \ -DCMAKE_UNITY_BUILD=ON \ -DOPENSSL_USE_STATIC_LIBS=ON \ -DThrift_ROOT=/opt/vcpkg/installed/x64-linux/lib \ + -D_VCPKG_INSTALLED_DIR=/opt/vcpkg/installed \ + -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=x64-linux-static-${CMAKE_BUILD_TYPE} \ -G ${CMAKE_GENERATOR} \ /arrow/cpp diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index a9a596be8931a..f61a2faea0d70 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -47,6 +47,7 @@ set ARROW_WITH_ZLIB=ON set ARROW_WITH_ZSTD=ON set CMAKE_UNITY_BUILD=ON set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +set VCPKG_FEATURE_FLAGS=-manifests mkdir C:\arrow-build pushd C:\arrow-build @@ -83,6 +84,8 @@ cmake ^ -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^ -DCMAKE_UNITY_BUILD=%CMAKE_UNITY_BUILD% ^ -DMSVC_LINK_VERBOSE=ON ^ + -D_VCPKG_INSTALLED_DIR=C:\vcpkg\installed ^ + -DVCPKG_MANIFEST_MODE=OFF ^ -DVCPKG_TARGET_TRIPLET=x64-windows-static-md-%CMAKE_BUILD_TYPE% ^ -G "%CMAKE_GENERATOR%" ^ C:\arrow\cpp || exit /B diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index cb33e676a7dc2..be03b75f5add5 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -96,8 +96,8 @@ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x6 cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386 # These are from https://dl.bintray.com/rtools/mingw{32,64}/ -cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,aws*}.a $DST_DIR/lib/x64 -cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,aws*}.a $DST_DIR/lib/i386 +cp 
$MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 +cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 # Create build artifact zip -r ${DST_DIR}.zip $DST_DIR diff --git a/cpp/CMakeSettings.json b/cpp/CMakeSettings.json new file mode 100644 index 0000000000000..90d3abbcadd17 --- /dev/null +++ b/cpp/CMakeSettings.json @@ -0,0 +1,21 @@ +{ + "configurations": [ + { + "name": "x64-Debug (default)", + "generator": "Ninja", + "configurationType": "Debug", + "inheritEnvironments": [ "msvc_x64_x64" ], + "buildRoot": "${projectDir}\\out\\build\\${name}", + "installRoot": "${projectDir}\\out\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "variables": [ + { + "name":"VCPKG_MANIFEST_MODE", + "value":"OFF" + } + ] + } + ] +} diff --git a/cpp/examples/minimal_build/example.cc b/cpp/examples/minimal_build/example.cc index 4b6acd2a0dd75..8f58de5777a49 100644 --- a/cpp/examples/minimal_build/example.cc +++ b/cpp/examples/minimal_build/example.cc @@ -39,6 +39,7 @@ Status RunMain(int argc, char** argv) { ARROW_ASSIGN_OR_RAISE( auto csv_reader, arrow::csv::TableReader::Make(arrow::default_memory_pool(), + arrow::io::AsyncContext(), input_file, arrow::csv::ReadOptions::Defaults(), arrow::csv::ParseOptions::Defaults(), diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 1e93cf9975a9f..4403def994932 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -189,7 +189,6 @@ set(ARROW_SRCS util/future.cc util/int_util.cc util/io_util.cc - util/iterator.cc util/logging.cc util/key_value_metadata.cc util/memory.cc diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 9bcd1621840ef..e29db00cfcf08 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -91,7 +91,7 @@ class ARROW_EXPORT Array { /// /// Note that for `null_count == 0` or for null type, this will 
be null. /// This buffer does not account for any slice offset - std::shared_ptr null_bitmap() const { return data_->buffers[0]; } + const std::shared_ptr& null_bitmap() const { return data_->buffers[0]; } /// Raw pointer to the null bitmap. /// @@ -160,7 +160,7 @@ class ARROW_EXPORT Array { /// Input-checking variant of Array::Slice Result> SliceSafe(int64_t offset) const; - std::shared_ptr data() const { return data_; } + const std::shared_ptr& data() const { return data_; } int num_fields() const { return static_cast(data_->child_data.size()); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 89087ee318c60..a97bf134604e7 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -45,6 +45,7 @@ #include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" +#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_compat.h" #include "arrow/testing/gtest_util.h" @@ -2598,4 +2599,350 @@ TEST(TestRechunkArraysConsistently, Plain) { } } +// ---------------------------------------------------------------------- +// Test SwapEndianArrayData + +/// \brief Indicate if fields are equals. 
+/// +/// \param[in] target ArrayData to be converted and tested +/// \param[in] expected result ArrayData +void AssertArrayDataEqualsWithSwapEndian(const std::shared_ptr& target, + const std::shared_ptr& expected) { + auto swap_array = MakeArray(*::arrow::internal::SwapEndianArrayData(target)); + auto expected_array = MakeArray(expected); + ASSERT_ARRAYS_EQUAL(*swap_array, *expected_array); + ASSERT_OK(swap_array->ValidateFull()); +} + +TEST(TestSwapEndianArrayData, PrimitiveType) { + auto null_buffer = Buffer::FromString("\xff"); + auto data_int_buffer = Buffer::FromString("01234567"); + + auto data = ArrayData::Make(null(), 0, {nullptr}, 0); + auto expected_data = data; + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(boolean(), 8, {null_buffer, data_int_buffer}, 0); + expected_data = data; + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(int8(), 8, {null_buffer, data_int_buffer}, 0); + expected_data = data; + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(uint16(), 4, {null_buffer, data_int_buffer}, 0); + auto data_int16_buffer = Buffer::FromString("10325476"); + expected_data = ArrayData::Make(uint16(), 4, {null_buffer, data_int16_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(int32(), 2, {null_buffer, data_int_buffer}, 0); + auto data_int32_buffer = Buffer::FromString("32107654"); + expected_data = ArrayData::Make(int32(), 2, {null_buffer, data_int32_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(uint64(), 1, {null_buffer, data_int_buffer}, 0); + auto data_int64_buffer = Buffer::FromString("76543210"); + expected_data = ArrayData::Make(uint64(), 1, {null_buffer, data_int64_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + auto data_16byte_buffer = Buffer::FromString("0123456789abcdef"); + data = 
ArrayData::Make(decimal128(38, 10), 1, {null_buffer, data_16byte_buffer}); + auto data_decimal128_buffer = Buffer::FromString("fedcba9876543210"); + expected_data = + ArrayData::Make(decimal128(38, 10), 1, {null_buffer, data_decimal128_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + auto data_32byte_buffer = Buffer::FromString("0123456789abcdef123456789ABCDEF0"); + data = ArrayData::Make(decimal256(76, 20), 1, {null_buffer, data_32byte_buffer}); + auto data_decimal256_buffer = Buffer::FromString("0FEDCBA987654321fedcba9876543210"); + expected_data = + ArrayData::Make(decimal256(76, 20), 1, {null_buffer, data_decimal256_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + auto data_float_buffer = Buffer::FromString("01200560"); + data = ArrayData::Make(float32(), 2, {null_buffer, data_float_buffer}, 0); + auto data_float32_buffer = Buffer::FromString("02100650"); + expected_data = ArrayData::Make(float32(), 2, {null_buffer, data_float32_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(float64(), 1, {null_buffer, data_float_buffer}); + auto data_float64_buffer = Buffer::FromString("06500210"); + expected_data = ArrayData::Make(float64(), 1, {null_buffer, data_float64_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + // With offset > 0 + data = + ArrayData::Make(int64(), 1, {null_buffer, data_int_buffer}, kUnknownNullCount, 1); + ASSERT_RAISES(Invalid, ::arrow::internal::SwapEndianArrayData(data)); +} + +std::shared_ptr ReplaceBuffers(const std::shared_ptr& data, + const int32_t buffer_index, + const std::vector& buffer_data) { + const auto test_data = data->Copy(); + test_data->buffers[buffer_index] = + std::make_shared(buffer_data.data(), buffer_data.size()); + return test_data; +} + +std::shared_ptr ReplaceBuffersInChild(const std::shared_ptr& data, + const int32_t child_index, + const std::vector& child_data) { + const auto test_data = 
data->Copy(); + // assume updating only buffer[1] in child_data + auto child_array_data = test_data->child_data[child_index]->Copy(); + child_array_data->buffers[1] = + std::make_shared(child_data.data(), child_data.size()); + test_data->child_data[child_index] = child_array_data; + return test_data; +} + +std::shared_ptr ReplaceBuffersInDictionary( + const std::shared_ptr& data, const int32_t buffer_index, + const std::vector& buffer_data) { + const auto test_data = data->Copy(); + auto dict_array_data = test_data->dictionary->Copy(); + dict_array_data->buffers[buffer_index] = + std::make_shared(buffer_data.data(), buffer_data.size()); + test_data->dictionary = dict_array_data; + return test_data; +} + +TEST(TestSwapEndianArrayData, BinaryType) { + auto array = ArrayFromJSON(binary(), R"(["0123", null, "45"])"); + const std::vector offset1 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; +#endif + auto expected_data = array->data(); + auto test_data = ReplaceBuffers(expected_data, 1, offset1); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(large_binary(), R"(["01234", null, "567"])"); + const std::vector offset2 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 8}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset2); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(fixed_size_binary(3), R"(["012", null, "345"])"); + expected_data = array->data(); + AssertArrayDataEqualsWithSwapEndian(expected_data, expected_data); +} + +TEST(TestSwapEndianArrayData, StringType) { + auto array = ArrayFromJSON(utf8(), R"(["ABCD", null, "EF"])"); + const std::vector offset1 = +#if 
ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; +#endif + auto expected_data = array->data(); + auto test_data = ReplaceBuffers(expected_data, 1, offset1); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(large_utf8(), R"(["ABCDE", null, "FGH"])"); + const std::vector offset2 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 8}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset2); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + +TEST(TestSwapEndianArrayData, ListType) { + auto type1 = std::make_shared(int32()); + auto array = ArrayFromJSON(type1, "[[0, 1, 2, 3], null, [4, 5]]"); + const std::vector offset1 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; +#endif + const std::vector data1 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5}; +#else + {0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0}; +#endif + auto expected_data = array->data(); + auto test_data = ReplaceBuffers(expected_data, 1, offset1); + test_data = ReplaceBuffersInChild(test_data, 0, data1); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto type2 = std::make_shared(int64()); + array = ArrayFromJSON(type2, "[[0, 1, 2], null, [3]]"); + const std::vector offset2 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0}; +#endif + const std::vector data2 = +#if 
ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset2); + test_data = ReplaceBuffersInChild(test_data, 0, data2); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto type3 = std::make_shared(int32(), 2); + array = ArrayFromJSON(type3, "[[0, 1], null, [2, 3]]"); + expected_data = array->data(); + const std::vector data3 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; +#endif + test_data = ReplaceBuffersInChild(expected_data, 0, data3); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + +TEST(TestSwapEndianArrayData, DictionaryType) { + auto type = dictionary(int32(), int16()); + auto dict = ArrayFromJSON(int16(), "[4, 5, 6, 7]"); + DictionaryArray array(type, ArrayFromJSON(int32(), "[0, 2, 3]"), dict); + auto expected_data = array.data(); + const std::vector data1 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; +#endif + const std::vector data2 = +#if ARROW_LITTLE_ENDIAN + {0, 4, 0, 5, 0, 6, 0, 7}; +#else + {4, 0, 5, 0, 6, 0, 7, 0}; +#endif + auto test_data = ReplaceBuffers(expected_data, 1, data1); + test_data = ReplaceBuffersInDictionary(test_data, 1, data2); + // dictionary must be explicitly swapped + test_data->dictionary = *::arrow::internal::SwapEndianArrayData(test_data->dictionary); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + +TEST(TestSwapEndianArrayData, StructType) { + auto array = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), + R"([{"a": 4, "b": null}, {"a": null, "b": "foo"}])"); + auto 
expected_data = array->data(); + const std::vector data1 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 4, 0, 0, 0, 0}; +#else + {4, 0, 0, 0, 0, 0, 0, 0}; +#endif + const std::vector data2 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0}; +#endif + auto test_data = ReplaceBuffersInChild(expected_data, 0, data1); + test_data = ReplaceBuffersInChild(test_data, 1, data2); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + +TEST(TestSwapEndianArrayData, UnionType) { + auto expected_i8 = ArrayFromJSON(int8(), "[127, null, null, null, null]"); + auto expected_str = ArrayFromJSON(utf8(), R"([null, "abcd", null, null, ""])"); + auto expected_i32 = ArrayFromJSON(int32(), "[null, null, 1, 2, null]"); + std::vector expected_types_vector; + expected_types_vector.push_back(Type::INT8); + expected_types_vector.insert(expected_types_vector.end(), 2, Type::STRING); + expected_types_vector.insert(expected_types_vector.end(), 2, Type::INT32); + std::shared_ptr expected_types; + ArrayFromVector(expected_types_vector, &expected_types); + auto arr1 = SparseUnionArray::Make( + *expected_types, {expected_i8, expected_str, expected_i32}, {"i8", "str", "i32"}, + {Type::INT8, Type::STRING, Type::INT32}); + auto expected_data = (*arr1)->data(); + const std::vector data1a = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0}; +#endif + const std::vector data1b = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0}; +#endif + auto test_data = ReplaceBuffersInChild(expected_data, 1, data1a); + test_data = ReplaceBuffersInChild(test_data, 2, data1b); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); + expected_str = 
ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); + expected_i32 = ArrayFromJSON(int32(), "[1, -259, 2]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); + auto arr2 = DenseUnionArray::Make( + *expected_types, *expected_offsets, {expected_i8, expected_str, expected_i32}, + {"i8", "str", "i32"}, {Type::INT8, Type::STRING, Type::INT32}); + expected_data = (*arr2)->data(); + const std::vector data2a = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, + 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0}; +#endif + const std::vector data2b = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0}; +#endif + const std::vector data2c = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 1, 255, 255, 254, 253, 0, 0, 0, 2}; +#else + {1, 0, 0, 0, 253, 254, 255, 255, 2, 0, 0, 0}; +#endif + test_data = ReplaceBuffers(expected_data, 2, data2a); + test_data = ReplaceBuffersInChild(test_data, 1, data2b); + test_data = ReplaceBuffersInChild(test_data, 2, data2c); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + +TEST(TestSwapEndianArrayData, ExtensionType) { + auto array_int16 = ArrayFromJSON(int16(), "[0, 1, 2, 3]"); + auto ext_data = array_int16->data()->Copy(); + ext_data->type = std::make_shared(); + auto array = MakeArray(ext_data); + auto expected_data = array->data(); + const std::vector data = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 1, 0, 2, 0, 3}; +#else + {0, 0, 1, 0, 2, 0, 3, 0}; +#endif + auto test_data = ReplaceBuffers(expected_data, 1, data); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index e73bbda7abc5a..07dc3014e4029 100644 --- 
a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -29,7 +29,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 0d498931d4202..297745a2b1754 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -41,6 +41,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -51,7 +52,7 @@ using internal::checked_cast; // ---------------------------------------------------------------------- // Loading from ArrayData -namespace internal { +namespace { class ArrayDataWrapper { public: @@ -74,11 +75,209 @@ class ArrayDataWrapper { std::shared_ptr* out_; }; +class ArrayDataEndianSwapper { + public: + ArrayDataEndianSwapper(const std::shared_ptr& data, int64_t length) + : data_(data), length_(length) { + out_ = data->Copy(); + } + + Status SwapType(const DataType& type) { + RETURN_NOT_OK(VisitTypeInline(type, this)); + RETURN_NOT_OK(SwapChildren(type.fields())); + if (internal::HasValidityBitmap(type.id())) { + // Copy null bitmap + out_->buffers[0] = data_->buffers[0]; + } + return Status::OK(); + } + + Status SwapChildren(const FieldVector& child_fields) { + for (size_t i = 0; i < child_fields.size(); i++) { + ARROW_ASSIGN_OR_RAISE(out_->child_data[i], + internal::SwapEndianArrayData(data_->child_data[i])); + } + return Status::OK(); + } + + template + Result> ByteSwapBuffer( + const std::shared_ptr& in_buffer) { + if (sizeof(T) == 1) { + // if data size is 1, element is not swapped. 
We can use the original buffer + return in_buffer; + } + auto in_data = reinterpret_cast(in_buffer->data()); + ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); + auto out_data = reinterpret_cast(out_buffer->mutable_data()); + int64_t length = in_buffer->size() / sizeof(T); + for (int64_t i = 0; i < length; i++) { + out_data[i] = BitUtil::ByteSwap(in_data[i]); + } + return std::move(out_buffer); + } + + template + Status SwapOffsets(int index) { + if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) { + out_->buffers[index] = data_->buffers[index]; + return Status::OK(); + } + // Except union, offset has one more element rather than data->length + ARROW_ASSIGN_OR_RAISE(out_->buffers[index], + ByteSwapBuffer(data_->buffers[index])); + return Status::OK(); + } + + template + enable_if_t::value && + !std::is_base_of::value && + !std::is_base_of::value, + Status> + Visit(const T& type) { + using value_type = typename T::c_type; + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], + ByteSwapBuffer(data_->buffers[1])); + return Status::OK(); + } + + Status Visit(const Decimal128Type& type) { + auto data = reinterpret_cast(data_->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); + int64_t length = length_; + length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2); + for (int64_t i = 0; i < length; i++) { + uint64_t tmp; + auto idx = i * 2; +#if ARROW_LITTLE_ENDIAN + tmp = BitUtil::FromBigEndian(data[idx]); + new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]); + new_data[idx + 1] = tmp; +#else + tmp = BitUtil::FromLittleEndian(data[idx]); + new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]); + new_data[idx + 1] = tmp; +#endif + } + out_->buffers[1] = std::move(new_buffer); + return Status::OK(); + } + + Status Visit(const Decimal256Type& type) { + auto data = reinterpret_cast(data_->buffers[1]->data()); + 
ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); + int64_t length = length_; + length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4); + for (int64_t i = 0; i < length; i++) { + uint64_t tmp0, tmp1, tmp2; + auto idx = i * 4; +#if ARROW_LITTLE_ENDIAN + tmp0 = BitUtil::FromBigEndian(data[idx]); + tmp1 = BitUtil::FromBigEndian(data[idx + 1]); + tmp2 = BitUtil::FromBigEndian(data[idx + 2]); + new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]); + new_data[idx + 1] = tmp2; + new_data[idx + 2] = tmp1; + new_data[idx + 3] = tmp0; +#else + tmp0 = BitUtil::FromLittleEndian(data[idx]); + tmp1 = BitUtil::FromLittleEndian(data[idx + 1]); + tmp2 = BitUtil::FromLittleEndian(data[idx + 2]); + new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]); + new_data[idx + 1] = tmp2; + new_data[idx + 2] = tmp1; + new_data[idx + 3] = tmp0; +#endif + } + out_->buffers[1] = std::move(new_buffer); + return Status::OK(); + } + + Status Visit(const DayTimeIntervalType& type) { + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer(data_->buffers[1])); + return Status::OK(); + } + + Status Visit(const NullType& type) { return Status::OK(); } + Status Visit(const BooleanType& type) { return Status::OK(); } + Status Visit(const Int8Type& type) { return Status::OK(); } + Status Visit(const UInt8Type& type) { return Status::OK(); } + Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } + Status Visit(const FixedSizeListType& type) { return Status::OK(); } + Status Visit(const StructType& type) { return Status::OK(); } + Status Visit(const UnionType& type) { + out_->buffers[1] = data_->buffers[1]; + if (type.mode() == UnionMode::DENSE) { + RETURN_NOT_OK(SwapOffsets(2)); + } + return Status::OK(); + } + + template + enable_if_t::value || std::is_same::value, + Status> + Visit(const T& type) { + RETURN_NOT_OK(SwapOffsets(1)); + out_->buffers[2] = data_->buffers[2]; + return 
Status::OK(); + } + + template + enable_if_t::value || + std::is_same::value, + Status> + Visit(const T& type) { + RETURN_NOT_OK(SwapOffsets(1)); + out_->buffers[2] = data_->buffers[2]; + return Status::OK(); + } + + Status Visit(const ListType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + return Status::OK(); + } + Status Visit(const LargeListType& type) { + RETURN_NOT_OK(SwapOffsets(1)); + return Status::OK(); + } + + Status Visit(const DictionaryType& type) { + // dictionary was already swapped in ReadDictionary() in ipc/reader.cc + RETURN_NOT_OK(SwapType(*type.index_type())); + return Status::OK(); + } + + Status Visit(const ExtensionType& type) { + RETURN_NOT_OK(SwapType(*type.storage_type())); + return Status::OK(); + } + + const std::shared_ptr& data_; + int64_t length_; + std::shared_ptr out_; +}; + +} // namespace + +namespace internal { + +Result> SwapEndianArrayData( + const std::shared_ptr& data) { + if (data->offset != 0) { + return Status::Invalid("Unsupported data format: data.offset != 0"); + } + ArrayDataEndianSwapper swapper(data, data->length); + RETURN_NOT_OK(swapper.SwapType(*data->type)); + return std::move(swapper.out_); +} + } // namespace internal std::shared_ptr MakeArray(const std::shared_ptr& data) { std::shared_ptr out; - internal::ArrayDataWrapper wrapper_visitor(data, &out); + ArrayDataWrapper wrapper_visitor(data, &out); DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor)); DCHECK(out); return out; diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index b400255c18ea7..3ef4e08828fe5 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -56,6 +56,17 @@ Result> MakeArrayFromScalar( namespace internal { +/// \brief Swap endian of each element in a generic ArrayData +/// +/// As dictionaries are often shared between different arrays, dictionaries +/// are not swapped by this function and should be handled separately. 
+/// +/// \param[in] data the array contents +/// \return the resulting ArrayData whose elements were swapped +ARROW_EXPORT +Result> SwapEndianArrayData( + const std::shared_ptr& data); + /// Given a number of ArrayVectors, treat each ArrayVector as the /// chunks of a chunked array. Then rechunk each ArrayVector such that /// all ArrayVectors are chunked identically. It is mandatory that diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index fc11f126e72a8..317fd01f17c62 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -33,6 +33,7 @@ #include "arrow/memory_pool.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 3144f5786d7f3..c5907549fe4ef 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -27,7 +27,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 9039bb5c485a1..11e03bba2873a 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -663,14 +663,13 @@ struct ScalarUnaryNotNullStateful { static void Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, Datum* out) { ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + auto out_data = out_arr->GetMutableValues(1); VisitArrayValuesInline( arg0, [&](Arg0Value v) { - functor.op.template Call(ctx, v).ToBytes(out_data); - out_data += 16; + *out_data++ = 
functor.op.template Call(ctx, v); }, - [&]() { out_data += 16; }); + [&]() { ++out_data; }); } }; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 3cef4026fb6f2..2592b77ab66e5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -21,37 +21,71 @@ #include #include "arrow/array/builder_nested.h" +#include "arrow/compute/api_scalar.h" #include "arrow/compute/cast.h" #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/scalar_cast_internal.h" +#include "arrow/util/bitmap_ops.h" namespace arrow { + +using internal::CopyBitmap; + namespace compute { namespace internal { template void CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const CastOptions& options = checked_cast(*ctx->state()).options; + using offset_type = typename Type::offset_type; + using ScalarType = typename TypeTraits::ScalarType; + + const CastOptions& options = CastState::Get(ctx); - const ArrayData& input = *batch[0].array(); - ArrayData* result = out->mutable_array(); + auto child_type = checked_cast(*out->type()).value_type(); - if (input.offset != 0) { - ctx->SetStatus(Status::NotImplemented( - "Casting sliced lists (non-zero offset) not yet implemented")); + if (out->kind() == Datum::SCALAR) { + const auto& in_scalar = checked_cast(*batch[0].scalar()); + auto out_scalar = checked_cast(out->scalar().get()); + + DCHECK(!out_scalar->is_valid); + if (in_scalar.is_valid) { + KERNEL_ASSIGN_OR_RAISE( + out_scalar->value, ctx, + Cast(*in_scalar.value, child_type, options, ctx->exec_context())); + + out_scalar->is_valid = true; + } return; } - // Copy buffers from parent - result->buffers = input.buffers; - auto child_type = checked_cast(*result->type).value_type(); + const ArrayData& in_array = *batch[0].array(); + ArrayData* out_array = out->mutable_array(); + + // Copy from parent + out_array->buffers = 
in_array.buffers; + Datum values = in_array.child_data[0]; + + if (in_array.offset != 0) { + KERNEL_ASSIGN_OR_RAISE(out_array->buffers[0], ctx, + CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), + in_array.offset, in_array.length)); + KERNEL_ASSIGN_OR_RAISE(out_array->buffers[1], ctx, + ctx->Allocate(sizeof(offset_type) * (in_array.length + 1))); + + auto offsets = in_array.GetValues(1); + auto shifted_offsets = out_array->GetMutableValues(1); + + for (int64_t i = 0; i < in_array.length + 1; ++i) { + shifted_offsets[i] = offsets[i] - offsets[0]; + } + values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); + } + + KERNEL_ASSIGN_OR_RAISE(Datum cast_values, ctx, + Cast(values, child_type, options, ctx->exec_context())); - Datum casted_child; - KERNEL_RETURN_IF_ERROR( - ctx, Cast(Datum(input.child_data[0]), child_type, options, ctx->exec_context()) - .Value(&casted_child)); - DCHECK_EQ(Datum::ARRAY, casted_child.kind()); - result->child_data.push_back(casted_child.array()); + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); } template diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 4520230f2ae97..77890d27da5e7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -20,6 +20,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/scalar_cast_internal.h" +#include "arrow/compute/kernels/util_internal.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/int_util.h" #include "arrow/util/value_parsing.h" @@ -361,8 +362,7 @@ struct CastFunctor::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - const ArrayData& input = *batch[0].array(); - const auto& in_type_inst = 
checked_cast(*input.type); + const auto& in_type_inst = checked_cast(*batch[0].type()); const auto in_scale = in_type_inst.scale(); if (options.allow_decimal_truncate) { @@ -395,34 +395,34 @@ struct CastFunctor::value>> { struct UnsafeUpscaleDecimal { template Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - return val.IncreaseScaleBy(out_scale_ - in_scale_); + return val.IncreaseScaleBy(by_); } - - int32_t out_scale_, in_scale_; + int32_t by_; }; struct UnsafeDownscaleDecimal { template Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - return val.ReduceScaleBy(in_scale_ - out_scale_, false); + return val.ReduceScaleBy(by_, false); } - - int32_t out_scale_, in_scale_; + int32_t by_; }; struct SafeRescaleDecimal { template Decimal128 Call(KernelContext* ctx, Decimal128 val) const { - auto result = val.Rescale(in_scale_, out_scale_); - if (ARROW_PREDICT_FALSE(!result.ok())) { - ctx->SetStatus(result.status()); - return Decimal128(); // Zero - } else if (ARROW_PREDICT_FALSE(!(*result).FitsInPrecision(out_precision_))) { - ctx->SetStatus(Status::Invalid("Decimal value does not fit in precision")); - return Decimal128(); // Zero - } else { - return *std::move(result); + auto maybe_rescaled = val.Rescale(in_scale_, out_scale_); + if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) { + ctx->SetStatus(maybe_rescaled.status()); + return {}; // Zero } + + if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) { + return maybe_rescaled.MoveValueUnsafe(); + } + + ctx->SetStatus(Status::Invalid("Decimal value does not fit in precision")); + return {}; // Zero } int32_t out_scale_, out_precision_, in_scale_; @@ -432,36 +432,33 @@ template <> struct CastFunctor { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - const ArrayData& input = *batch[0].array(); - ArrayData* output = out->mutable_array(); - const auto& in_type_inst = checked_cast(*input.type); - 
const auto& out_type_inst = checked_cast(*output->type); - const auto in_scale = in_type_inst.scale(); - const auto out_scale = out_type_inst.scale(); - const auto out_precision = out_type_inst.precision(); + const auto& in_type = checked_cast(*batch[0].type()); + const auto& out_type = checked_cast(*out->type()); + const auto in_scale = in_type.scale(); + const auto out_scale = out_type.scale(); if (options.allow_decimal_truncate) { if (in_scale < out_scale) { // Unsafe upscale applicator::ScalarUnaryNotNullStateful - kernel(UnsafeUpscaleDecimal{out_scale, in_scale}); + kernel(UnsafeUpscaleDecimal{out_scale - in_scale}); return kernel.Exec(ctx, batch, out); } else { // Unsafe downscale applicator::ScalarUnaryNotNullStateful - kernel(UnsafeDownscaleDecimal{out_scale, in_scale}); + kernel(UnsafeDownscaleDecimal{in_scale - out_scale}); return kernel.Exec(ctx, batch, out); } - } else { - // Safe rescale - applicator::ScalarUnaryNotNullStateful - kernel(SafeRescaleDecimal{out_scale, out_precision, in_scale}); - return kernel.Exec(ctx, batch, out); } + + // Safe rescale + applicator::ScalarUnaryNotNullStateful + kernel(SafeRescaleDecimal{out_scale, out_type.precision(), in_scale}); + return kernel.Exec(ctx, batch, out); } }; @@ -471,15 +468,16 @@ struct CastFunctor { struct RealToDecimal { template Decimal128 Call(KernelContext* ctx, RealType val) const { - auto result = Decimal128::FromReal(val, out_precision_, out_scale_); - if (ARROW_PREDICT_FALSE(!result.ok())) { - if (!allow_truncate_) { - ctx->SetStatus(result.status()); - } - return Decimal128(); // Zero - } else { - return *std::move(result); + auto maybe_decimal = Decimal128::FromReal(val, out_precision_, out_scale_); + + if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) { + return maybe_decimal.MoveValueUnsafe(); } + + if (!allow_truncate_) { + ctx->SetStatus(maybe_decimal.status()); + } + return {}; // Zero } int32_t out_scale_, out_precision_; @@ -490,10 +488,9 @@ template struct CastFunctor::value>> { static void 
Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = checked_cast(ctx->state())->options; - ArrayData* output = out->mutable_array(); - const auto& out_type_inst = checked_cast(*output->type); - const auto out_scale = out_type_inst.scale(); - const auto out_precision = out_type_inst.precision(); + const auto& out_type = checked_cast(*out->type()); + const auto out_scale = out_type.scale(); + const auto out_precision = out_type.precision(); applicator::ScalarUnaryNotNullStateful kernel( RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate}); @@ -516,9 +513,8 @@ struct DecimalToReal { template struct CastFunctor::value>> { static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& in_type_inst = - checked_cast(*batch[0].array()->type); - const auto in_scale = in_type_inst.scale(); + const auto& in_type = checked_cast(*batch[0].type()); + const auto in_scale = in_type.scale(); applicator::ScalarUnaryNotNullStateful kernel( DecimalToReal{in_scale}); @@ -564,7 +560,7 @@ std::shared_ptr GetCastToInteger(std::string name) { AddCommonNumberCasts(out_ty, func.get()); // From decimal to integer - DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType::Array(Type::DECIMAL)}, out_ty, + DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); return func; } @@ -588,7 +584,7 @@ std::shared_ptr GetCastToFloating(std::string name) { AddCommonNumberCasts(out_ty, func.get()); // From decimal to floating point - DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType::Array(Type::DECIMAL)}, out_ty, + DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty, CastFunctor::Exec)); return func; } @@ -608,8 +604,8 @@ std::shared_ptr GetCastToDecimal128() { // Cast from other decimal auto exec = CastFunctor::Exec; // We resolve the output type of this kernel from the CastOptions - DCHECK_OK(func->AddKernel(Type::DECIMAL128, 
{InputType::Array(Type::DECIMAL128)}, - sig_out_ty, exec)); + DCHECK_OK( + func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec)); return func; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index b339018072e5d..6f965a46676e5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -94,26 +94,45 @@ struct Utf8Validator { }; template -struct CastBinaryToBinaryOffsets; +void CastBinaryToBinaryOffsets(KernelContext* ctx, const ArrayData& input, + ArrayData* output) { + static_assert(std::is_same::value, "Cast same-width offsets (no-op)"); +} -// Cast same-width offsets (no-op) -template <> -struct CastBinaryToBinaryOffsets { - static void CastOffsets(KernelContext* ctx, const ArrayData& input, ArrayData* output) { - } -}; +// Upcast offsets template <> -struct CastBinaryToBinaryOffsets { - static void CastOffsets(KernelContext* ctx, const ArrayData& input, ArrayData* output) { - } -}; +void CastBinaryToBinaryOffsets(KernelContext* ctx, + const ArrayData& input, + ArrayData* output) { + using input_offset_type = int32_t; + using output_offset_type = int64_t; + KERNEL_ASSIGN_OR_RAISE( + output->buffers[1], ctx, + ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type))); + memset(output->buffers[1]->mutable_data(), 0, + output->offset * sizeof(output_offset_type)); + ::arrow::internal::CastInts(input.GetValues(1), + output->GetMutableValues(1), + output->length + 1); +} -// Upcast offsets +// Downcast offsets template <> -struct CastBinaryToBinaryOffsets { - static void CastOffsets(KernelContext* ctx, const ArrayData& input, ArrayData* output) { - using input_offset_type = int32_t; - using output_offset_type = int64_t; +void CastBinaryToBinaryOffsets(KernelContext* ctx, + const ArrayData& input, + ArrayData* output) { + using input_offset_type = int64_t; + using 
output_offset_type = int32_t; + + constexpr input_offset_type kMaxOffset = std::numeric_limits::max(); + + auto input_offsets = input.GetValues(1); + + // Binary offsets are ascending, so it's enough to check the last one for overflow. + if (input_offsets[input.length] > kMaxOffset) { + ctx->SetStatus(Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + output->type->ToString(), ": input array too large")); + } else { KERNEL_ASSIGN_OR_RAISE(output->buffers[1], ctx, ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type))); @@ -123,66 +142,32 @@ struct CastBinaryToBinaryOffsets { output->GetMutableValues(1), output->length + 1); } -}; +} -// Downcast offsets -template <> -struct CastBinaryToBinaryOffsets { - static void CastOffsets(KernelContext* ctx, const ArrayData& input, ArrayData* output) { - using input_offset_type = int64_t; - using output_offset_type = int32_t; - - constexpr input_offset_type kMaxOffset = - std::numeric_limits::max(); - - auto input_offsets = input.GetValues(1); - - // Binary offsets are ascending, so it's enough to check the last one for overflow. 
- if (input_offsets[input.length] > kMaxOffset) { - ctx->SetStatus(Status::Invalid("Failed casting from ", input.type->ToString(), - " to ", output->type->ToString(), - ": input array too large")); - } else { - KERNEL_ASSIGN_OR_RAISE(output->buffers[1], ctx, - ctx->Allocate((output->length + output->offset + 1) * - sizeof(output_offset_type))); - memset(output->buffers[1]->mutable_data(), 0, - output->offset * sizeof(output_offset_type)); - ::arrow::internal::CastInts(input.GetValues(1), - output->GetMutableValues(1), - output->length + 1); +template +void BinaryToBinaryCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + DCHECK(out->is_array()); + const CastOptions& options = checked_cast(*ctx->state()).options; + const ArrayData& input = *batch[0].array(); + + if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) { + InitializeUTF8(); + + ArrayDataVisitor visitor; + Utf8Validator validator; + Status st = visitor.Visit(input, &validator); + if (!st.ok()) { + ctx->SetStatus(st); + return; } } -}; -template -struct BinaryToBinaryCastFunctor { - using input_offset_type = typename I::offset_type; - using output_offset_type = typename O::offset_type; + // Start with a zero-copy cast, but change indices to expected size + ZeroCopyCastExec(ctx, batch, out); - static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - DCHECK(out->is_array()); - const CastOptions& options = checked_cast(*ctx->state()).options; - const ArrayData& input = *batch[0].array(); - - if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) { - InitializeUTF8(); - - ArrayDataVisitor visitor; - Utf8Validator validator; - Status st = visitor.Visit(input, &validator); - if (!st.ok()) { - ctx->SetStatus(st); - return; - } - } - - // Start with a zero-copy cast, but change indices to expected size - ZeroCopyCastExec(ctx, batch, out); - CastBinaryToBinaryOffsets::CastOffsets( - ctx, input, out->mutable_array()); - } -}; + CastBinaryToBinaryOffsets( + ctx, 
input, out->mutable_array()); +} #if defined(_MSC_VER) #pragma warning(pop) @@ -216,7 +201,7 @@ void AddBinaryToBinaryCast(CastFunction* func) { DCHECK_OK(func->AddKernel( InType::type_id, {in_ty}, out_ty, - TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastFunctor::Exec), + TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastExec), NullHandling::COMPUTED_NO_PREALLOCATE)); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 2a0f44187b28e..99a56346c1b5f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -38,6 +38,7 @@ #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" +#include "arrow/util/bitmap.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" @@ -49,32 +50,30 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; namespace compute { -// Use std::string and Decimal128 for supplying test values for base binary types - -template -struct TestCType { - using type = typename T::c_type; -}; - -template -struct TestCType> { - using type = std::string; -}; - -template -struct TestCType> { - using type = Decimal128; -}; - -static constexpr const char* kInvalidUtf8 = "\xa0\xa1"; +static std::shared_ptr InvalidUtf8(std::shared_ptr type) { + return ArrayFromJSON(type, + "[" + R"( + "Hi", + "olá mundo", + "你好世界", + "", + )" + "\"\xa0\xa1\"" + "]"); +} static std::vector> kNumericTypes = { uint8(), int8(), uint16(), int16(), uint32(), int32(), uint64(), int64(), float32(), float64()}; +static std::vector> kDictionaryIndexTypes = { + int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()}; + static std::vector> kBaseBinaryTypes = { binary(), utf8(), large_binary(), large_utf8()}; @@ -83,330 +82,66 @@ static void AssertBufferSame(const Array& left, const Array& right, int buffer_i right.data()->buffers[buffer_index].get()); } -class 
TestCast : public TestBase { - public: - void CheckPass(const Array& input, const Array& expected, - const std::shared_ptr& out_type, const CastOptions& options, - bool check_scalar = true, bool validate_full = true) { - ASSERT_OK_AND_ASSIGN(std::shared_ptr result, Cast(input, out_type, options)); - if (validate_full) { - ASSERT_OK(result->ValidateFull()); - } else { - ASSERT_OK(result->Validate()); - } - AssertArraysEqual(expected, *result, /*verbose=*/true); - - if (input.type_id() == Type::DECIMAL || out_type->id() == Type::DECIMAL) { - // ARROW-10835 - check_scalar = false; - } - - if (check_scalar) { - for (int64_t i = 0; i < input.length(); ++i) { - ASSERT_OK_AND_ASSIGN(Datum out, Cast(*input.GetScalar(i), out_type, options)); - AssertScalarsEqual(**expected.GetScalar(i), *out.scalar(), /*verbose=*/true); - } - } - } - - void CheckFails(const Array& input, const std::shared_ptr& out_type, - const CastOptions& options, bool check_scalar = true) { - ASSERT_RAISES(Invalid, Cast(input, out_type, options)); - - if (input.type_id() == Type::DECIMAL || out_type->id() == Type::DECIMAL) { - // ARROW-10835 - check_scalar = false; - } - - // For the scalars, check that at least one of the input fails (since many - // of the tests contains a mix of passing and failing values). 
In some - // cases we will want to check more precisely - if (check_scalar) { - int64_t num_failing = 0; - for (int64_t i = 0; i < input.length(); ++i) { - auto maybe_out = Cast(*input.GetScalar(i), out_type, options); - num_failing += static_cast(maybe_out.status().IsInvalid()); - } - ASSERT_GT(num_failing, 0); - } - } - - template ::type> - void CheckFails(const std::shared_ptr& in_type, - const std::vector& in_values, const std::vector& is_valid, - const std::shared_ptr& out_type, const CastOptions& options, - bool check_scalar = true) { - std::shared_ptr input; - if (is_valid.size() > 0) { - ArrayFromVector(in_type, is_valid, in_values, &input); - } else { - ArrayFromVector(in_type, in_values, &input); - } - CheckFails(*input, out_type, options, check_scalar); - } - - template ::type> - void CheckFails(const std::vector& in_values, const std::vector& is_valid, - const std::shared_ptr& out_type, const CastOptions& options, - bool check_scalar = true) { - CheckFails(TypeTraits::type_singleton(), in_values, is_valid, - out_type, options, check_scalar); - } - - void CheckZeroCopy(const Array& input, const std::shared_ptr& out_type) { - ASSERT_OK_AND_ASSIGN(std::shared_ptr result, Cast(input, out_type)); - ASSERT_OK(result->ValidateFull()); - ASSERT_EQ(input.data()->buffers.size(), result->data()->buffers.size()); - for (size_t i = 0; i < input.data()->buffers.size(); ++i) { - AssertBufferSame(input, *result, static_cast(i)); - } - } - - template ::type, - typename O_TYPE = typename TestCType::type> - void CheckCase(const std::shared_ptr& in_type, - const std::vector& in_values, const std::vector& is_valid, - const std::shared_ptr& out_type, - const std::vector& out_values, const CastOptions& options, - bool check_scalar = true, bool validate_full = true) { - ASSERT_EQ(in_values.size(), out_values.size()); - std::shared_ptr input, expected; - if (is_valid.size() > 0) { - ASSERT_EQ(is_valid.size(), out_values.size()); - ArrayFromVector(in_type, is_valid, in_values, 
&input); - ArrayFromVector(out_type, is_valid, out_values, &expected); - } else { - ArrayFromVector(in_type, in_values, &input); - ArrayFromVector(out_type, out_values, &expected); - } - CheckPass(*input, *expected, out_type, options, check_scalar, validate_full); - - // Check a sliced variant - if (input->length() > 1) { - CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options, check_scalar, - validate_full); - } - } - - template - void CheckCase(const std::vector& in_values, const std::vector& is_valid, - const std::vector& out_values, const CastOptions& options, - bool check_scalar = true, bool validate_full = true) { - CheckCase( - TypeTraits::type_singleton(), in_values, is_valid, - TypeTraits::type_singleton(), out_values, options, check_scalar, - validate_full); - } - - void CheckCaseJSON(const std::shared_ptr& in_type, - const std::shared_ptr& out_type, - const std::string& in_json, const std::string& expected_json, - bool check_scalar = true, - const CastOptions& options = CastOptions()) { - std::shared_ptr input = ArrayFromJSON(in_type, in_json); - std::shared_ptr expected = ArrayFromJSON(out_type, expected_json); - ASSERT_EQ(input->length(), expected->length()); - CheckPass(*input, *expected, out_type, options, check_scalar); - - // Check a sliced variant - if (input->length() > 1) { - CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options, - /*check_scalar=*/false); - } - } - - void CheckFailsJSON(const std::shared_ptr& in_type, - const std::shared_ptr& out_type, - const std::string& in_json, bool check_scalar = true, - const CastOptions& options = CastOptions()) { - std::shared_ptr input = ArrayFromJSON(in_type, in_json); - CheckFails(*input, out_type, options, check_scalar); - } - - template - void TestCastBinaryToBinary() { - CastOptions options; - auto src_type = TypeTraits::type_singleton(); - auto dest_type = TypeTraits::type_singleton(); - - // All valid except the last one - std::vector all = {1, 1, 1, 1, 1}; - 
std::vector valid = {1, 1, 1, 1, 0}; - std::vector strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; - - // Should accept when invalid but null. - CheckCase(strings, valid, strings, options, - /*check_scalar=*/false); - - // Should accept empty array - CheckCaseJSON(src_type, dest_type, "[]", "[]", /*check_scalar=*/false); - - if (!SourceType::is_utf8 && DestType::is_utf8) { - // Should refuse due to invalid utf8 payload - CheckFails(strings, all, dest_type, options, - /*check_scalar=*/false); - // Should accept due to option override - options.allow_invalid_utf8 = true; - CheckCase(strings, all, strings, options, - /*check_scalar=*/false, /*validate_full=*/false); - } else { - // Destination type allows non-utf8 data, - // or source type also enforces utf8 data. - const bool validate_full = !DestType::is_utf8; - CheckCase(strings, all, strings, options, - /*check_scalar=*/false, validate_full); - } - } - - template - void TestCastNumberToString() { - auto dest_type = TypeTraits::type_singleton(); - - CheckCaseJSON(int8(), dest_type, "[0, 1, 127, -128, null]", - R"(["0", "1", "127", "-128", null])", /*check_scalar=*/false); - CheckCaseJSON(uint8(), dest_type, "[0, 1, 255, null]", R"(["0", "1", "255", null])", - /*check_scalar=*/false); - CheckCaseJSON(int16(), dest_type, "[0, 1, 32767, -32768, null]", - R"(["0", "1", "32767", "-32768", null])", /*check_scalar=*/false); - CheckCaseJSON(uint16(), dest_type, "[0, 1, 65535, null]", - R"(["0", "1", "65535", null])", /*check_scalar=*/false); - CheckCaseJSON(int32(), dest_type, "[0, 1, 2147483647, -2147483648, null]", - R"(["0", "1", "2147483647", "-2147483648", null])", - /*check_scalar=*/false); - CheckCaseJSON(uint32(), dest_type, "[0, 1, 4294967295, null]", - R"(["0", "1", "4294967295", null])", /*check_scalar=*/false); - CheckCaseJSON(int64(), dest_type, - "[0, 1, 9223372036854775807, -9223372036854775808, null]", - R"(["0", "1", "9223372036854775807", "-9223372036854775808", null])", - 
/*check_scalar=*/false); - CheckCaseJSON(uint64(), dest_type, "[0, 1, 18446744073709551615, null]", - R"(["0", "1", "18446744073709551615", null])", /*check_scalar=*/false); - - CheckCaseJSON(float32(), dest_type, "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]", - R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])", - /*check_scalar=*/false); - CheckCaseJSON(float64(), dest_type, "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]", - R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])", - /*check_scalar=*/false); - } +static void CheckCast(std::shared_ptr input, std::shared_ptr expected, + CastOptions options = CastOptions{}) { + options.to_type = expected->type(); + CheckScalarUnary("cast", input, expected, &options); +} - template - void TestCastBooleanToString() { - auto dest_type = TypeTraits::type_singleton(); +static void CheckCastFails(std::shared_ptr input, CastOptions options) { + ASSERT_RAISES(Invalid, Cast(input, options)) + << "\n to_type: " << options.to_type->ToString() + << "\n input: " << input->ToString(); - CheckCaseJSON(boolean(), dest_type, "[true, true, false, null]", - R"(["true", "true", "false", null])", /*check_scalar=*/false); + if (input->type_id() == Type::EXTENSION) { + // ExtensionScalar not implemented + return; } - template - void TestCastStringToNumber() { - CastOptions options; - auto src_type = TypeTraits::type_singleton(); - - std::vector is_valid = {true, false, true, true, true}; - - // string to int - std::vector v_int = {"0", "1", "127", "-1", "0"}; - std::vector e_int8 = {0, 1, 127, -1, 0}; - std::vector e_int16 = {0, 1, 127, -1, 0}; - std::vector e_int32 = {0, 1, 127, -1, 0}; - std::vector e_int64 = {0, 1, 127, -1, 0}; - CheckCase(v_int, is_valid, e_int8, options); - CheckCase(v_int, is_valid, e_int16, options); - CheckCase(v_int, is_valid, e_int32, options); - CheckCase(v_int, is_valid, e_int64, options); - - v_int = {"2147483647", "0", "-2147483648", "0", "0"}; - e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; - CheckCase(v_int, is_valid, 
e_int32, options); - v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; - e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; - CheckCase(v_int, is_valid, e_int64, options); - - // string to uint - std::vector v_uint = {"0", "1", "127", "255", "0"}; - std::vector e_uint8 = {0, 1, 127, 255, 0}; - std::vector e_uint16 = {0, 1, 127, 255, 0}; - std::vector e_uint32 = {0, 1, 127, 255, 0}; - std::vector e_uint64 = {0, 1, 127, 255, 0}; - CheckCase(v_uint, is_valid, e_uint8, options); - CheckCase(v_uint, is_valid, e_uint16, options); - CheckCase(v_uint, is_valid, e_uint32, options); - CheckCase(v_uint, is_valid, e_uint64, options); - - v_uint = {"4294967295", "0", "0", "0", "0"}; - e_uint32 = {4294967295, 0, 0, 0, 0}; - CheckCase(v_uint, is_valid, e_uint32, options); - v_uint = {"18446744073709551615", "0", "0", "0", "0"}; - e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; - CheckCase(v_uint, is_valid, e_uint64, options); - - // string to float - std::vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; - std::vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; - std::vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; - CheckCase(v_float, is_valid, e_float, options); - CheckCase(v_float, is_valid, e_double, options); - -#if !defined(_WIN32) || defined(NDEBUG) - // Test that casting is locale-independent - { - // French locale uses the comma as decimal point - LocaleGuard locale_guard("fr_FR.UTF-8"); - CheckCase(v_float, is_valid, e_float, options); - CheckCase(v_float, is_valid, e_double, options); - } -#endif + // For the scalars, check that at least one of the input fails (since many + // of the tests contains a mix of passing and failing values). 
In some + // cases we will want to check more precisely + int64_t num_failing = 0; + for (int64_t i = 0; i < input->length(); ++i) { + ASSERT_OK_AND_ASSIGN(auto scalar, input->GetScalar(i)); + num_failing += static_cast(Cast(scalar, options).status().IsInvalid()); } + ASSERT_GT(num_failing, 0); +} - template - void TestCastStringToTimestamp() { - CastOptions options; - auto src_type = TypeTraits::type_singleton(); - - std::vector is_valid = {true, false, true}; - std::vector strings = {"1970-01-01", "xxx", "2000-02-29"}; - - auto type = timestamp(TimeUnit::SECOND); - std::vector e = {0, 0, 951782400}; - CheckCase(src_type, strings, is_valid, type, e, options); - - type = timestamp(TimeUnit::MICRO); - e = {0, 0, 951782400000000LL}; - CheckCase(src_type, strings, is_valid, type, e, options); +static void CheckCastZeroCopy(std::shared_ptr input, + std::shared_ptr to_type, + CastOptions options = CastOptions::Safe()) { + ASSERT_OK_AND_ASSIGN(auto converted, Cast(*input, to_type, options)); + ASSERT_OK(converted->ValidateFull()); - // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc + ASSERT_EQ(input->data()->buffers.size(), converted->data()->buffers.size()); + for (size_t i = 0; i < input->data()->buffers.size(); ++i) { + AssertBufferSame(*input, *converted, static_cast(i)); } +} - void TestCastFloatingToDecimal(const std::shared_ptr& in_type) { - auto out_type = decimal(5, 2); - - CheckCaseJSON(in_type, out_type, "[0.0, null, 123.45, 123.456, 999.994]", - R"(["0.00", null, "123.45", "123.46", "999.99"])"); - - // Overflow - CastOptions options{}; - out_type = decimal(5, 2); - CheckFailsJSON(in_type, out_type, "[999.996]", /*check_scalar=*/true, options); - - options.allow_decimal_truncate = true; - CheckCaseJSON(in_type, out_type, "[0.0, null, 999.996, 123.45, 999.994]", - R"(["0.00", null, "0.00", "123.45", "999.99"])", /*check_scalar=*/true, - options); +static std::shared_ptr MaskArrayWithNullsAt(std::shared_ptr input, + std::vector 
indices_to_mask) { + auto masked = input->data()->Copy(); + masked->buffers[0] = *AllocateEmptyBitmap(input->length()); + masked->null_count = kUnknownNullCount; + + using arrow::internal::Bitmap; + Bitmap is_valid(masked->buffers[0], 0, input->length()); + if (auto original = input->null_bitmap()) { + is_valid.CopyFrom(Bitmap(original, input->offset(), input->length())); + } else { + is_valid.SetBitsTo(true); } - void TestCastDecimalToFloating(const std::shared_ptr& out_type) { - auto in_type = decimal(5, 2); - - CheckCaseJSON(in_type, out_type, R"(["0.00", null, "123.45", "999.99"])", - "[0.0, null, 123.45, 999.99]"); - // Edge cases are tested in Decimal128::ToReal() + for (int i : indices_to_mask) { + is_valid.SetBitTo(i, false); } -}; + return MakeArray(masked); +} -TEST_F(TestCast, CanCast) { +TEST(Cast, CanCast) { auto ExpectCanCast = [](std::shared_ptr from, std::vector> to_set, bool expected = true) { @@ -475,7 +210,7 @@ TEST_F(TestCast, CanCast) { ExpectCannotCast(null(), {smallint()}); // FIXME missing common cast from null } -TEST_F(TestCast, SameTypeZeroCopy) { +TEST(Cast, SameTypeZeroCopy) { std::shared_ptr arr = ArrayFromJSON(int32(), "[0, null, 2, 3, 4]"); ASSERT_OK_AND_ASSIGN(std::shared_ptr result, Cast(*arr, int32())); @@ -483,7 +218,7 @@ TEST_F(TestCast, SameTypeZeroCopy) { AssertBufferSame(*arr, *result, 1); } -TEST_F(TestCast, ZeroChunks) { +TEST(Cast, ZeroChunks) { auto chunked_i32 = std::make_shared(ArrayVector{}, int32()); ASSERT_OK_AND_ASSIGN(Datum result, Cast(chunked_i32, utf8())); @@ -491,1033 +226,806 @@ TEST_F(TestCast, ZeroChunks) { AssertChunkedEqual(*result.chunked_array(), ChunkedArray({}, utf8())); } -TEST_F(TestCast, CastDoesNotProvideDefaultOptions) { +TEST(Cast, CastDoesNotProvideDefaultOptions) { std::shared_ptr arr = ArrayFromJSON(int32(), "[0, null, 2, 3, 4]"); ASSERT_RAISES(Invalid, CallFunction("cast", {arr})); } -TEST_F(TestCast, FromBoolean) { - CastOptions options; - - std::vector is_valid(20, true); - is_valid[3] = 
false; - - std::vector v1(is_valid.size(), true); - std::vector e1(is_valid.size(), 1); - for (size_t i = 0; i < v1.size(); ++i) { - if (i % 3 == 1) { - v1[i] = false; - e1[i] = 0; - } - } - - CheckCase(v1, is_valid, e1, options); +TEST(Cast, FromBoolean) { + std::string vals = "[1, 0, null, 1, 0, 1, 1, null, 0, 0, 1]"; + CheckCast(ArrayFromJSON(boolean(), vals), ArrayFromJSON(int32(), vals)); } -TEST_F(TestCast, ToBoolean) { - CastOptions options; +TEST(Cast, ToBoolean) { for (auto type : kNumericTypes) { - CheckCaseJSON(type, boolean(), "[0, null, 127, 1, 0]", - "[false, null, true, true, false]"); + CheckCast(ArrayFromJSON(type, "[0, null, 127, 1, 0]"), + ArrayFromJSON(boolean(), "[false, null, true, true, false]")); } // Check negative numbers - CheckCaseJSON(int8(), boolean(), "[0, null, 127, -1, 0]", - "[false, null, true, true, false]"); - CheckCaseJSON(float64(), boolean(), "[0, null, 127, -1, 0]", - "[false, null, true, true, false]"); + for (auto type : {int8(), float64()}) { + CheckCast(ArrayFromJSON(type, "[0, null, 127, -1, 0]"), + ArrayFromJSON(boolean(), "[false, null, true, true, false]")); + } } -TEST_F(TestCast, ToIntUpcast) { - CastOptions options; - options.allow_int_overflow = false; - +TEST(Cast, ToIntUpcast) { std::vector is_valid = {true, false, true, true, true}; // int8 to int32 - std::vector v1 = {0, 1, 127, -1, 0}; - std::vector e1 = {0, 1, 127, -1, 0}; - CheckCase(v1, is_valid, e1, options); - - // bool to int8 - std::vector v2 = {false, true, false, true, true}; - std::vector e2 = {0, 1, 0, 1, 1}; - CheckCase(v2, is_valid, e2, options); + CheckCast(ArrayFromJSON(int8(), "[0, null, 127, -1, 0]"), + ArrayFromJSON(int32(), "[0, null, 127, -1, 0]")); // uint8 to int16, no overflow/underrun - std::vector v3 = {0, 100, 200, 255, 0}; - std::vector e3 = {0, 100, 200, 255, 0}; - CheckCase(v3, is_valid, e3, options); + CheckCast(ArrayFromJSON(uint8(), "[0, 100, 200, 255, 0]"), + ArrayFromJSON(int16(), "[0, 100, 200, 255, 0]")); } 
-TEST_F(TestCast, OverflowInNullSlot) { - CastOptions options; - options.allow_int_overflow = false; - - std::vector is_valid = {true, false, true, true, true}; - - std::vector v11 = {0, 70000, 2000, 1000, 0}; - std::vector e11 = {0, 0, 2000, 1000, 0}; - - std::shared_ptr expected; - ArrayFromVector(int16(), is_valid, e11, &expected); - - auto buf = Buffer::Wrap(v11.data(), v11.size()); - Int32Array tmp11(5, buf, expected->null_bitmap(), -1); - - CheckPass(tmp11, *expected, int16(), options); +TEST(Cast, OverflowInNullSlot) { + CheckCast( + MaskArrayWithNullsAt(ArrayFromJSON(int32(), "[0, 87654321, 2000, 1000, 0]"), {1}), + ArrayFromJSON(int16(), "[0, null, 2000, 1000, 0]")); } -TEST_F(TestCast, ToIntDowncastSafe) { - CastOptions options; - options.allow_int_overflow = false; - - std::vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100, 200, 1, 2}; - CheckCase(v1, is_valid, e1, options); - - // int16 to uint8, with overflow - std::vector v2 = {0, 100, 256, 0, 0}; - CheckFails(v2, is_valid, uint8(), options); +TEST(Cast, ToIntDowncastSafe) { + // int16 to uint8, no overflow/underflow + CheckCast(ArrayFromJSON(int16(), "[0, null, 200, 1, 2]"), + ArrayFromJSON(uint8(), "[0, null, 200, 1, 2]")); - // underflow - std::vector v3 = {0, 100, -1, 0, 0}; - CheckFails(v3, is_valid, uint8(), options); + // int16 to uint8, overflow + CheckCastFails(ArrayFromJSON(int16(), "[0, null, 256, 0, 0]"), + CastOptions::Safe(uint8())); + // ... 
and underflow + CheckCastFails(ArrayFromJSON(int16(), "[0, null, -1, 0, 0]"), + CastOptions::Safe(uint8())); - // int32 to int16, no overflow - std::vector v4 = {0, 1000, 2000, 1, 2}; - std::vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(v4, is_valid, e4, options); + // int32 to int16, no overflow/underflow + CheckCast(ArrayFromJSON(int32(), "[0, null, 2000, 1, 2]"), + ArrayFromJSON(int16(), "[0, null, 2000, 1, 2]")); // int32 to int16, overflow - std::vector v5 = {0, 1000, 2000, 70000, 0}; - CheckFails(v5, is_valid, int16(), options); - - // underflow - std::vector v6 = {0, 1000, 2000, -70000, 0}; - CheckFails(v6, is_valid, int16(), options); - - std::vector v7 = {0, 1000, 2000, -70000, 0}; - CheckFails(v7, is_valid, uint8(), options); -} - -template -std::vector UnsafeVectorCast(const std::vector& v) { - size_t n_elems = v.size(); - std::vector result(n_elems); + CheckCastFails(ArrayFromJSON(int32(), "[0, null, 2000, 70000, 2]"), + CastOptions::Safe(int16())); - for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); + // ... and underflow + CheckCastFails(ArrayFromJSON(int32(), "[0, null, 2000, -70000, 2]"), + CastOptions::Safe(int16())); - return result; + CheckCastFails(ArrayFromJSON(int32(), "[0, null, 2000, -70000, 2]"), + CastOptions::Safe(uint8())); } -TEST_F(TestCast, IntegerSignedToUnsigned) { - CastOptions options; - options.allow_int_overflow = false; - - std::vector is_valid = {true, false, true, true, true}; - - std::vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; - +TEST(Cast, IntegerSignedToUnsigned) { + auto i32s = ArrayFromJSON(int32(), "[-2147483648, null, -1, 65535, 2147483647]"); // Same width - CheckFails(v1, is_valid, uint32(), options); + CheckCastFails(i32s, CastOptions::Safe(uint32())); // Wider - CheckFails(v1, is_valid, uint64(), options); + CheckCastFails(i32s, CastOptions::Safe(uint64())); // Narrower - CheckFails(v1, is_valid, uint16(), options); - // Fail because of overflow (instead of underflow). 
- std::vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; - CheckFails(over, is_valid, uint16(), options); + CheckCastFails(i32s, CastOptions::Safe(uint16())); + CastOptions options; options.allow_int_overflow = true; - CheckCase(v1, is_valid, UnsafeVectorCast(v1), - options); - CheckCase(v1, is_valid, UnsafeVectorCast(v1), - options); - CheckCase(v1, is_valid, UnsafeVectorCast(v1), - options); - CheckCase(over, is_valid, - UnsafeVectorCast(over), options); -} + CheckCast(i32s, + ArrayFromJSON(uint32(), "[2147483648, null, 4294967295, 65535, 2147483647]"), + options); + CheckCast(i32s, + ArrayFromJSON( + uint64(), + "[18446744071562067968, null, 18446744073709551615, 65535, 2147483647]"), + options); + CheckCast(i32s, ArrayFromJSON(uint16(), "[0, null, 65535, 65535, 65535]"), options); -TEST_F(TestCast, IntegerUnsignedToSigned) { - CastOptions options; - options.allow_int_overflow = false; + // Fail because of overflow (instead of underflow). + i32s = ArrayFromJSON(int32(), "[0, null, 0, 65536, 2147483647]"); + CheckCastFails(i32s, CastOptions::Safe(uint16())); - std::vector is_valid = {true, true, true}; + CheckCast(i32s, ArrayFromJSON(uint16(), "[0, null, 0, 0, 65535]"), options); +} - std::vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; - std::vector v2 = {0, INT16_MAX + 1, 2}; +TEST(Cast, IntegerUnsignedToSigned) { + auto u32s = ArrayFromJSON(uint32(), "[4294967295, null, 0, 32768]"); // Same width - CheckFails(v1, is_valid, int32(), options); + CheckCastFails(u32s, CastOptions::Safe(int32())); + // Narrower - CheckFails(v1, is_valid, int16(), options); - CheckFails(v2, is_valid, int16(), options); + CheckCastFails(u32s, CastOptions::Safe(int16())); + CheckCastFails(u32s->Slice(1), CastOptions::Safe(int16())); + CastOptions options; options.allow_int_overflow = true; - CheckCase(v1, is_valid, UnsafeVectorCast(v1), - options); - CheckCase(v1, is_valid, UnsafeVectorCast(v1), - options); - CheckCase(v1, is_valid, UnsafeVectorCast(v1), - options); - CheckCase(v2, 
is_valid, UnsafeVectorCast(v2), - options); + CheckCast(u32s, ArrayFromJSON(int32(), "[-1, null, 0, 32768]"), options); + CheckCast(u32s, ArrayFromJSON(int64(), "[4294967295, null, 0, 32768]"), options); + CheckCast(u32s, ArrayFromJSON(int16(), "[-1, null, 0, -32768]"), options); } -TEST_F(TestCast, ToIntDowncastUnsafe) { +TEST(Cast, ToIntDowncastUnsafe) { CastOptions options; options.allow_int_overflow = true; - std::vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100, 200, 1, 2}; - CheckCase(v1, is_valid, e1, options); - - // int16 to uint8, with overflow - std::vector v2 = {0, 100, 256, 0, 0}; - std::vector e2 = {0, 100, 0, 0, 0}; - CheckCase(v2, is_valid, e2, options); + // int16 to uint8, no overflow/underflow + CheckCast(ArrayFromJSON(int16(), "[0, null, 200, 1, 2]"), + ArrayFromJSON(uint8(), "[0, null, 200, 1, 2]"), options); - // underflow - std::vector v3 = {0, 100, -1, 0, 0}; - std::vector e3 = {0, 100, 255, 0, 0}; - CheckCase(v3, is_valid, e3, options); + // int16 to uint8, with overflow/underflow + CheckCast(ArrayFromJSON(int16(), "[0, null, 256, 1, 2, -1]"), + ArrayFromJSON(uint8(), "[0, null, 0, 1, 2, 255]"), options); - // int32 to int16, no overflow - std::vector v4 = {0, 1000, 2000, 1, 2}; - std::vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(v4, is_valid, e4, options); + // int32 to int16, no overflow/underflow + CheckCast(ArrayFromJSON(int32(), "[0, null, 2000, 1, 2, -1]"), + ArrayFromJSON(int16(), "[0, null, 2000, 1, 2, -1]"), options); - // int32 to int16, overflow - // TODO(wesm): do we want to allow this? we could set to null - std::vector v5 = {0, 1000, 2000, 70000, 0}; - std::vector e5 = {0, 1000, 2000, 4464, 0}; - CheckCase(v5, is_valid, e5, options); - - // underflow - // TODO(wesm): do we want to allow this? 
we could set overflow to null - std::vector v6 = {0, 1000, 2000, -70000, 0}; - std::vector e6 = {0, 1000, 2000, -4464, 0}; - CheckCase(v6, is_valid, e6, options); + // int32 to int16, with overflow/underflow + CheckCast(ArrayFromJSON(int32(), "[0, null, 2000, 70000, -70000]"), + ArrayFromJSON(int16(), "[0, null, 2000, 4464, -4464]"), options); } -TEST_F(TestCast, FloatingPointToInt) { - // which means allow_float_truncate == false - auto options = CastOptions::Safe(); - - std::vector is_valid = {true, false, true, true, true}; - std::vector all_valid = {true, true, true, true, true}; - - // float32 to int32 no truncation - std::vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; - std::vector e1 = {1, 0, 0, -1, 5}; - CheckCase(v1, is_valid, e1, options); - CheckCase(v1, all_valid, e1, options); - - // float64 to int32 no truncation - std::vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; - std::vector e2 = {1, 0, 0, -1, 5}; - CheckCase(v2, is_valid, e2, options); - CheckCase(v2, all_valid, e2, options); - - // float64 to int64 no truncation - std::vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; - std::vector e3 = {1, 0, 0, -1, 5}; - CheckCase(v3, is_valid, e3, options); - CheckCase(v3, all_valid, e3, options); - - // float64 to int32 truncate - std::vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; - std::vector e4 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(v4, is_valid, int32(), options); - CheckFails(v4, all_valid, int32(), options); - - options.allow_float_truncate = true; - CheckCase(v4, is_valid, e4, options); - CheckCase(v4, all_valid, e4, options); - - // float64 to int64 truncate - std::vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; - std::vector e5 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(v5, is_valid, int64(), options); - CheckFails(v5, all_valid, int64(), options); - - options.allow_float_truncate = true; - CheckCase(v5, is_valid, e5, options); - CheckCase(v5, all_valid, e5, options); +TEST(Cast, FloatingToInt) { + for (auto from : {float32(), 
float64()}) { + for (auto to : {int32(), int64()}) { + // float to int no truncation + CheckCast(ArrayFromJSON(from, "[1.0, null, 0.0, -1.0, 5.0]"), + ArrayFromJSON(to, "[1, null, 0, -1, 5]")); + + // float to int truncate error + auto opts = CastOptions::Safe(to); + CheckCastFails(ArrayFromJSON(from, "[1.5, 0.0, null, 0.5, -1.5, 5.5]"), opts); + + // float to int truncate allowed + opts.allow_float_truncate = true; + CheckCast(ArrayFromJSON(from, "[1.5, 0.0, null, 0.5, -1.5, 5.5]"), + ArrayFromJSON(to, "[1, 0, null, 0, -1, 5]"), opts); + } + } } -TEST_F(TestCast, IntToFloatingPoint) { - auto options = CastOptions::Safe(); - - std::vector all_valid = {true, true, true, true, true}; - std::vector all_invalid = {false, false, false, false, false}; - - std::vector u32_v1 = {1LL << 24, (1LL << 24) + 1}; - CheckFails(u32_v1, {true, true}, float32(), options); +TEST(Cast, IntToFloating) { + for (auto from : {uint32(), int32()}) { + std::string two_24 = "[16777216, 16777217]"; - std::vector u32_v2 = {1LL << 24, 1LL << 24}; - CheckCase(u32_v2, {true, true}, - UnsafeVectorCast(u32_v2), options); + CheckCastFails(ArrayFromJSON(from, two_24), CastOptions::Safe(float32())); - std::vector i32_v1 = {1LL << 24, (1LL << 24) + 1}; - std::vector i32_v2 = {1LL << 24, 1LL << 24}; - CheckFails(i32_v1, {true, true}, float32(), options); - CheckCase(i32_v2, {true, true}, - UnsafeVectorCast(i32_v2), options); + CheckCast(ArrayFromJSON(from, two_24)->Slice(0, 1), + ArrayFromJSON(float32(), two_24)->Slice(0, 1)); + } - std::vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; - CheckFails(v1, all_valid, float64(), options); + auto i64s = ArrayFromJSON(int64(), + "[-9223372036854775808, -9223372036854775807, 0," + " 9223372036854775806, 9223372036854775807]"); + CheckCastFails(i64s, CastOptions::Safe(float64())); - // While it's not safe to convert, all values are null. 
- CheckCase(v1, all_invalid, UnsafeVectorCast(v1), - options); + // Masking those values with nulls makes this safe + CheckCast(MaskArrayWithNullsAt(i64s, {0, 1, 3, 4}), + ArrayFromJSON(float64(), "[null, null, 0, null, null]")); - CheckFails({1LL << 53, (1LL << 53) + 1}, {true, true}, float64(), options); + CheckCastFails(ArrayFromJSON(uint64(), "[9007199254740992, 9007199254740993]"), + CastOptions::Safe(float64())); } -TEST_F(TestCast, DecimalToInt) { - CastOptions options; - std::vector is_valid2 = {true, true}; - std::vector is_valid3 = {true, true, false}; - - // no overflow no truncation - std::vector v12 = {Decimal128("02.0000000000"), - Decimal128("-11.0000000000")}; - std::vector v13 = {Decimal128("02.0000000000"), - Decimal128("-11.0000000000"), - Decimal128("-12.0000000000")}; - std::vector e12 = {2, -11}; - std::vector e13 = {2, -11, 0}; +TEST(Cast, DecimalToInt) { + auto options = CastOptions::Safe(int64()); for (bool allow_int_overflow : {false, true}) { for (bool allow_decimal_truncate : {false, true}) { options.allow_int_overflow = allow_int_overflow; options.allow_decimal_truncate = allow_decimal_truncate; - CheckCase(decimal(38, 10), v12, is_valid2, int64(), e12, - options); - CheckCase(decimal(38, 10), v13, is_valid3, int64(), e13, - options); + + auto no_overflow_no_truncation = ArrayFromJSON(decimal(38, 10), R"([ + "02.0000000000", + "-11.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + CheckCast(no_overflow_no_truncation, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); } } - // truncation, no overflow - std::vector v22 = {Decimal128("02.1000000000"), - Decimal128("-11.0000004500")}; - std::vector v23 = {Decimal128("02.1000000000"), - Decimal128("-11.0000004500"), - Decimal128("-12.0000004500")}; - std::vector e22 = {2, -11}; - std::vector e23 = {2, -11, 0}; - for (bool allow_int_overflow : {false, true}) { options.allow_int_overflow = allow_int_overflow; + auto truncation_but_no_overflow = 
ArrayFromJSON(decimal(38, 10), R"([ + "02.1000000000", + "-11.0000004500", + "22.0000004500", + "-121.1210000000", + null])"); + options.allow_decimal_truncate = true; - CheckCase(decimal(38, 10), v22, is_valid2, int64(), e22, - options); - CheckCase(decimal(38, 10), v23, is_valid3, int64(), e23, - options); + CheckCast(truncation_but_no_overflow, + ArrayFromJSON(int64(), "[2, -11, 22, -121, null]"), options); + options.allow_decimal_truncate = false; - CheckFails(decimal(38, 10), v22, is_valid2, int64(), options); - CheckFails(decimal(38, 10), v23, is_valid3, int64(), options); + CheckCastFails(truncation_but_no_overflow, options); } - // overflow, no truncation - std::vector v32 = {Decimal128("12345678901234567890000.0000000000"), - Decimal128("99999999999999999999999.0000000000")}; - std::vector v33 = {Decimal128("12345678901234567890000.0000000000"), - Decimal128("99999999999999999999999.0000000000"), - Decimal128("99999999999999999999999.0000000000")}; - // 12345678901234567890000 % 2**64, 99999999999999999999999 % 2**64 - std::vector e32 = {4807115922877858896, 200376420520689663}; - std::vector e33 = {4807115922877858896, 200376420520689663, -2}; - for (bool allow_decimal_truncate : {false, true}) { options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_no_truncation = ArrayFromJSON(decimal(38, 10), R"([ + "12345678901234567890000.0000000000", + "99999999999999999999999.0000000000", + null])"); + options.allow_int_overflow = true; - CheckCase(decimal(38, 10), v32, is_valid2, int64(), e32, - options); - CheckCase(decimal(38, 10), v33, is_valid3, int64(), e33, - options); + CheckCast( + overflow_no_truncation, + ArrayFromJSON(int64(), + // 12345678901234567890000 % 2**64, 99999999999999999999999 % 2**64 + "[4807115922877858896, 200376420520689663, null]"), + options); + options.allow_int_overflow = false; - CheckFails(decimal(38, 10), v32, is_valid2, int64(), options); - CheckFails(decimal(38, 10), v33, is_valid3, int64(), options); + 
CheckCastFails(overflow_no_truncation, options); } - // overflow, truncation - std::vector v42 = {Decimal128("12345678901234567890000.0045345000"), - Decimal128("99999999999999999999999.0000005430")}; - std::vector v43 = {Decimal128("12345678901234567890000.0005345340"), - Decimal128("99999999999999999999999.0000344300"), - Decimal128("99999999999999999999999.0004354000")}; - // 12345678901234567890000 % 2**64, 99999999999999999999999 % 2**64 - std::vector e42 = {4807115922877858896, 200376420520689663}; - std::vector e43 = {4807115922877858896, 200376420520689663, -2}; - for (bool allow_int_overflow : {false, true}) { for (bool allow_decimal_truncate : {false, true}) { options.allow_int_overflow = allow_int_overflow; options.allow_decimal_truncate = allow_decimal_truncate; + + auto overflow_and_truncation = ArrayFromJSON(decimal(38, 10), R"([ + "12345678901234567890000.0045345000", + "99999999999999999999999.0000344300", + null])"); + if (options.allow_int_overflow && options.allow_decimal_truncate) { - CheckCase(decimal(38, 10), v42, is_valid2, int64(), - e42, options); - CheckCase(decimal(38, 10), v43, is_valid3, int64(), - e43, options); + CheckCast(overflow_and_truncation, + ArrayFromJSON( + int64(), + // 12345678901234567890000 % 2**64, 99999999999999999999999 % 2**64 + "[4807115922877858896, 200376420520689663, null]"), + options); } else { - CheckFails(decimal(38, 10), v42, is_valid2, int64(), options); - CheckFails(decimal(38, 10), v43, is_valid3, int64(), options); + CheckCastFails(overflow_and_truncation, options); } } } - // negative scale - std::vector v5 = {Decimal128("1234567890000."), Decimal128("-120000.")}; - for (int i = 0; i < 2; i++) v5[i] = v5[i].Rescale(0, -4).ValueOrDie(); - std::vector e5 = {1234567890000, -120000}; - CheckCase(decimal(38, -4), v5, is_valid2, int64(), e5, - options); + Decimal128Builder builder(decimal(38, -4)); + for (auto d : {Decimal128("1234567890000."), Decimal128("-120000.")}) { + ASSERT_OK_AND_ASSIGN(d, d.Rescale(0, 
-4)); + ASSERT_OK(builder.Append(d)); + } + ASSERT_OK_AND_ASSIGN(auto negative_scale, builder.Finish()); + options.allow_int_overflow = true; + options.allow_decimal_truncate = true; + CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); } -TEST_F(TestCast, DecimalToDecimal) { +TEST(Cast, DecimalToDecimal) { CastOptions options; - std::vector is_valid1 = {true}; - std::vector is_valid2 = {true, true}; - std::vector is_valid3 = {true, true, false}; - - // Non-truncating - - std::vector v12 = {Decimal128("02.0000000000"), - Decimal128("30.0000000000")}; - std::vector e12 = {Decimal128("02."), Decimal128("30.")}; - std::vector v13 = {Decimal128("02.0000000000"), Decimal128("30.0000000000"), - Decimal128("30.0000000000")}; - std::vector e13 = {Decimal128("02."), Decimal128("30."), Decimal128("-1.")}; - for (bool allow_decimal_truncate : {false, true}) { options.allow_decimal_truncate = allow_decimal_truncate; - CheckCase(decimal(38, 10), v12, is_valid2, - decimal(28, 0), e12, options); - CheckCase(decimal(38, 10), v13, is_valid3, - decimal(28, 0), e13, options); - // and back - CheckCase(decimal(28, 0), e12, is_valid2, - decimal(38, 10), v12, options); - CheckCase(decimal(28, 0), e13, is_valid3, - decimal(38, 10), v13, options); + + auto no_truncation = ArrayFromJSON(decimal(38, 10), R"([ + "02.0000000000", + "30.0000000000", + "22.0000000000", + "-121.0000000000", + null])"); + auto expected = ArrayFromJSON(decimal(28, 0), R"([ + "02.", + "30.", + "22.", + "-121.", + null])"); + + CheckCast(no_truncation, expected, options); + CheckCast(expected, no_truncation, options); } - // Same scale, different precision - std::vector v14 = {Decimal128("12.34"), Decimal128("0.56")}; for (bool allow_decimal_truncate : {false, true}) { options.allow_decimal_truncate = allow_decimal_truncate; - CheckCase(decimal(5, 2), v14, is_valid2, - decimal(4, 2), v14, options); - // and back - CheckCase(decimal(4, 2), v14, is_valid2, - decimal(5, 2), v14, 
options); - } - auto check_truncate = [this](const std::shared_ptr& input_type, - const std::vector& input, - const std::vector& is_valid, - const std::shared_ptr& output_type, - const std::vector& expected_output) { - CastOptions options; + // Same scale, different precision + auto d_5_2 = ArrayFromJSON(decimal(5, 2), R"([ + "12.34", + "0.56"])"); + auto d_4_2 = ArrayFromJSON(decimal(4, 2), R"([ + "12.34", + "0.56"])"); - options.allow_decimal_truncate = true; - CheckCase(input_type, input, is_valid, output_type, - expected_output, options); - options.allow_decimal_truncate = false; - CheckFails(input_type, input, is_valid, output_type, options); - }; + CheckCast(d_5_2, d_4_2, options); + CheckCast(d_4_2, d_5_2, options); + } - auto check_truncate_and_back = - [this](const std::shared_ptr& input_type, - const std::vector& input, const std::vector& is_valid, - const std::shared_ptr& output_type, - const std::vector& expected_output, - const std::vector& expected_back_convert) { - CastOptions options; - - options.allow_decimal_truncate = true; - CheckCase(input_type, input, is_valid, - output_type, expected_output, options); - // and back - CheckCase(output_type, expected_output, is_valid, - input_type, expected_back_convert, - options); - - options.allow_decimal_truncate = false; - CheckFails(input_type, input, is_valid, output_type, options); - // back case is valid - CheckCase(output_type, expected_output, is_valid, - input_type, expected_back_convert, - options); - }; - - // Rescale leads to truncation - - std::vector v22 = {Decimal128("-02.1234567890"), - Decimal128("30.1234567890")}; - std::vector e22 = {Decimal128("-02."), Decimal128("30.")}; - std::vector f22 = {Decimal128("-02.0000000000"), - Decimal128("30.0000000000")}; - std::vector v23 = {Decimal128("-02.1234567890"), - Decimal128("30.1234567890"), - Decimal128("30.1234567890")}; - std::vector e23 = {Decimal128("-02."), Decimal128("30."), - Decimal128("-70.")}; - std::vector f23 = 
{Decimal128("-02.0000000000"), - Decimal128("30.0000000000"), - Decimal128("80.0000000000")}; - - check_truncate_and_back(decimal(38, 10), v22, is_valid2, decimal(28, 0), e22, f22); - check_truncate_and_back(decimal(38, 10), v23, is_valid3, decimal(28, 0), e23, f23); + auto d_38_10 = ArrayFromJSON(decimal(38, 10), R"([ + "-02.1234567890", + "30.1234567890", + null])"); - // Precision loss without rescale leads to truncation + auto d_28_0 = ArrayFromJSON(decimal(28, 0), R"([ + "-02.", + "30.", + null])"); - std::vector v3 = {Decimal128("12.34")}; - std::vector e3 = {Decimal128("12.34")}; + auto d_38_10_roundtripped = ArrayFromJSON(decimal(38, 10), R"([ + "-02.0000000000", + "30.0000000000", + null])"); - check_truncate(decimal(4, 2), v3, is_valid1, decimal(3, 2), e3); + // Rescale which leads to truncation + options.allow_decimal_truncate = true; + CheckCast(d_38_10, d_28_0, options); + CheckCast(d_28_0, d_38_10_roundtripped, options); - // Precision loss with rescale leads to truncation + options.allow_decimal_truncate = false; + options.to_type = d_28_0->type(); + CheckCastFails(d_38_10, options); + CheckCast(d_28_0, d_38_10_roundtripped, options); - std::vector v4 = {Decimal128("12.34")}; - std::vector e4 = {Decimal128("12.340")}; - std::vector v5 = {Decimal128("12.34")}; - std::vector e5 = {Decimal128("12.3")}; + // Precision loss without rescale leads to truncation + auto d_4_2 = ArrayFromJSON(decimal(4, 2), R"(["12.34"])"); + for (auto expected : { + ArrayFromJSON(decimal(3, 2), R"(["12.34"])"), + ArrayFromJSON(decimal(4, 3), R"(["12.340"])"), + ArrayFromJSON(decimal(2, 1), R"(["12.3"])"), + }) { + options.allow_decimal_truncate = true; + CheckCast(d_4_2, expected, options); - check_truncate(decimal(4, 2), v4, is_valid1, decimal(4, 3), e4); - check_truncate(decimal(4, 2), v5, is_valid1, decimal(2, 1), e5); + options.allow_decimal_truncate = false; + options.to_type = expected->type(); + CheckCastFails(d_4_2, options); + } } -TEST_F(TestCast, FloatToDecimal) { 
- auto in_type = float32(); +TEST(Cast, FloatingToDecimal) { + for (auto float_type : {float32(), float64()}) { + CheckCast( + ArrayFromJSON(float_type, "[0.0, null, 123.45, 123.456, 999.994]"), + ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "123.45", "123.46", "999.99"])")); - TestCastFloatingToDecimal(in_type); + // Overflow + CastOptions options; + options.to_type = decimal(5, 2); + CheckCastFails(ArrayFromJSON(float_type, "[999.996]"), options); + + options.allow_decimal_truncate = true; + CheckCast( + ArrayFromJSON(float_type, "[0.0, null, 999.996, 123.45, 999.994]"), + ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "0.00", "123.45", "999.99"])"), + options); + } // 2**64 + 2**41 (exactly representable as a float) - auto out_type = decimal(20, 0); - CheckCaseJSON(in_type, out_type, "[1.8446746e+19, -1.8446746e+19]", - R"(["18446746272732807168", "-18446746272732807168"])"); - out_type = decimal(20, 4); - CheckCaseJSON(in_type, out_type, "[1.8446746e+15, -1.8446746e+15]", - R"(["1844674627273280.7168", "-1844674627273280.7168"])"); - - // More edge cases tested in Decimal128::FromReal -} + CheckCast(ArrayFromJSON(float32(), "[1.8446746e+19, -1.8446746e+19]"), + ArrayFromJSON(decimal(20, 0), + R"(["18446746272732807168", "-18446746272732807168"])")); -TEST_F(TestCast, DoubleToDecimal) { - auto in_type = float64(); + CheckCast(ArrayFromJSON(float32(), "[1.8446746e+15, -1.8446746e+15]"), + ArrayFromJSON(decimal(20, 4), + R"(["1844674627273280.7168", "-1844674627273280.7168"])")); - TestCastFloatingToDecimal(in_type); + CheckCast(ArrayFromJSON(float64(), "[1.8446744073709556e+19, -1.8446744073709556e+19]"), + ArrayFromJSON(decimal(20, 0), + R"(["18446744073709555712", "-18446744073709555712"])")); - // 2**64 + 2**11 (exactly representable as a double) - auto out_type = decimal(20, 0); - CheckCaseJSON(in_type, out_type, "[1.8446744073709556e+19, -1.8446744073709556e+19]", - R"(["18446744073709555712", "-18446744073709555712"])"); - out_type = decimal(20, 4); - 
CheckCaseJSON(in_type, out_type, "[1.8446744073709556e+15, -1.8446744073709556e+15]", - R"(["1844674407370955.5712", "-1844674407370955.5712"])"); + CheckCast(ArrayFromJSON(float64(), "[1.8446744073709556e+15, -1.8446744073709556e+15]"), + ArrayFromJSON(decimal(20, 4), + R"(["1844674407370955.5712", "-1844674407370955.5712"])")); - // More edge cases tested in Decimal128::FromReal + // Edge cases are tested for Decimal128::FromReal() } -TEST_F(TestCast, DecimalToFloat) { - auto out_type = float32(); - TestCastDecimalToFloating(out_type); -} +TEST(Cast, DecimalToFloating) { + for (auto float_type : {float32(), float64()}) { + CheckCast(ArrayFromJSON(decimal(5, 2), R"(["0.00", null, "123.45", "999.99"])"), + ArrayFromJSON(float_type, "[0.0, null, 123.45, 999.99]")); + } -TEST_F(TestCast, DecimalToDouble) { - auto out_type = float64(); - TestCastDecimalToFloating(out_type); + // Edge cases are tested for Decimal128::ToReal() } -TEST_F(TestCast, TimestampToTimestamp) { +TEST(Cast, TimestampToTimestamp) { + struct TimestampTypePair { + std::shared_ptr coarse, fine; + }; + CastOptions options; - auto CheckTimestampCast = [this](const CastOptions& options, TimeUnit::type from_unit, - TimeUnit::type to_unit, - const std::vector& from_values, - const std::vector& to_values, - const std::vector& is_valid) { - // ARROW-9196: make temporal casts work with scalars - CheckCase(timestamp(from_unit), from_values, is_valid, - timestamp(to_unit), to_values, options, - /*check_scalar=*/false); - }; + for (auto types : { + TimestampTypePair{timestamp(TimeUnit::SECOND), timestamp(TimeUnit::MILLI)}, + TimestampTypePair{timestamp(TimeUnit::MILLI), timestamp(TimeUnit::MICRO)}, + TimestampTypePair{timestamp(TimeUnit::MICRO), timestamp(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = ArrayFromJSON(types.fine, "[0, null, 200000, 1000, 2000]"); - std::vector is_valid = {true, false, true, true, true}; + // multiply/promote + 
CheckCast(coarse, promoted); - // Multiply promotions - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); + auto will_be_truncated = ArrayFromJSON(types.fine, "[0, null, 200456, 1123, 2456]"); - std::vector v2 = {0, 100, 200, 1, 2}; - std::vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); - std::vector v3 = {0, 100, 200, 1, 2}; - std::vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } - std::vector v4 = {0, 100, 200, 1, 2}; - std::vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); + for (auto types : { + TimestampTypePair{timestamp(TimeUnit::SECOND), timestamp(TimeUnit::MICRO)}, + TimestampTypePair{timestamp(TimeUnit::MILLI), timestamp(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = ArrayFromJSON(types.fine, "[0, null, 200000000, 1000000, 2000000]"); - std::vector v5 = {0, 100, 200, 1, 2}; - std::vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); + // multiply/promote + CheckCast(coarse, promoted); - std::vector v6 = {0, 100, 200, 1, 2}; - std::vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); + auto will_be_truncated = + ArrayFromJSON(types.fine, "[0, null, 
200456000, 1123000, 2456000]"); - // Zero copy - std::vector v7 = {0, 70000, 2000, 1000, 0}; - std::shared_ptr arr; - ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, &arr); - CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); - // ARROW-1773, cast to integer - CheckZeroCopy(*arr, int64()); + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } - // Divide, truncate - std::vector v8 = {0, 100123, 200456, 1123, 2456}; - std::vector e8 = {0, 100, 200, 1, 2}; + for (auto types : { + TimestampTypePair{timestamp(TimeUnit::SECOND), timestamp(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = + ArrayFromJSON(types.fine, "[0, null, 200000000000, 1000000000, 2000000000]"); - options.allow_time_truncate = true; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); - - std::vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - std::vector e9 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); - - std::vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - std::vector e10 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, - timestamp(TimeUnit::SECOND), options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::MICRO), v8, 
is_valid, - timestamp(TimeUnit::MILLI), options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, - timestamp(TimeUnit::MICRO), options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, - timestamp(TimeUnit::SECOND), options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, - timestamp(TimeUnit::MILLI), options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, - timestamp(TimeUnit::SECOND), options, - /*check_scalar=*/false); - - // Multiply overflow + // multiply/promote + CheckCast(coarse, promoted); - // 1000-01-01, 1800-01-01 , 2000-01-01, 2300-01-01, 3000-01-01 - std::vector v11 = {-30610224000, -5364662400, 946684800, 10413792000, - 32503680000}; + auto will_be_truncated = + ArrayFromJSON(types.fine, "[0, null, 200456000000, 1123000000, 2456000000]"); - options.allow_time_overflow = false; - CheckFails(timestamp(TimeUnit::SECOND), v11, is_valid, - timestamp(TimeUnit::NANO), options, - /*check_scalar=*/false); -} + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); -TEST_F(TestCast, TimestampToDate32_Date64) { - CastOptions options; + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } +} - std::vector is_valid = {true, true, false}; - - // 2000-01-01, 2000-01-02, null - std::vector v_nano = {946684800000000000, 946771200000000000, 0}; - std::vector v_micro = {946684800000000, 946771200000000, 0}; - std::vector v_milli = {946684800000, 946771200000, 0}; - std::vector v_second = {946684800, 946771200, 0}; - std::vector v_day = {10957, 10958, 0}; - - // Simple conversions - CheckCase(timestamp(TimeUnit::NANO), v_nano, is_valid, - date64(), v_milli, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::MICRO), v_micro, is_valid, - 
date64(), v_milli, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::MILLI), v_milli, is_valid, - date64(), v_milli, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::SECOND), v_second, is_valid, - date64(), v_milli, options, - /*check_scalar=*/false); - - CheckCase(timestamp(TimeUnit::NANO), v_nano, is_valid, - date32(), v_day, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::MICRO), v_micro, is_valid, - date32(), v_day, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::MILLI), v_milli, is_valid, - date32(), v_day, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::SECOND), v_second, is_valid, - date32(), v_day, options, - /*check_scalar=*/false); - - // Disallow truncate, failures - std::vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; - std::vector v_micro_fail = {946684800000001, 946771200000001, 0}; - std::vector v_milli_fail = {946684800001, 946771200001, 0}; - std::vector v_second_fail = {946684801, 946771201, 0}; - - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), - options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), - options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), - options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date64(), options, - /*check_scalar=*/false); - - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), - options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), - options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), - options, - /*check_scalar=*/false); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date32(), options, - /*check_scalar=*/false); - - // 
Make sure that nulls are excluded from the truncation checks - std::vector v_second_nofail = {946684800, 946771200, 1}; - CheckCase(timestamp(TimeUnit::SECOND), v_second_nofail, - is_valid, date64(), v_milli, options, - /*check_scalar=*/false); - CheckCase(timestamp(TimeUnit::SECOND), v_second_nofail, - is_valid, date32(), v_day, options, - /*check_scalar=*/false); +TEST(Cast, TimestampZeroCopy) { + for (auto zero_copy_to_type : { + timestamp(TimeUnit::SECOND), + int64(), // ARROW-1773, cast to integer + }) { + CheckCastZeroCopy( + ArrayFromJSON(timestamp(TimeUnit::SECOND), "[0, null, 2000, 1000, 0]"), + zero_copy_to_type); + } + CheckCastZeroCopy(ArrayFromJSON(int64(), "[0, null, 2000, 1000, 0]"), + timestamp(TimeUnit::SECOND)); } -TEST_F(TestCast, TimeToCompatible) { +TEST(Cast, TimestampToTimestampMultiplyOverflow) { CastOptions options; + options.to_type = timestamp(TimeUnit::NANO); + // 1000-01-01, 1800-01-01 , 2000-01-01, 2300-01-01, 3000-01-01 + CheckCastFails( + ArrayFromJSON(timestamp(TimeUnit::SECOND), + "[-30610224000, -5364662400, 946684800, 10413792000, 32503680000]"), + options); +} - std::vector is_valid = {true, false, true, true, true}; +TEST(Cast, TimestampToDate) { + for (auto date : { + // 2000-01-01, 2000-01-02, null + ArrayFromJSON(date32(), "[10957, 10958, null]"), + ArrayFromJSON(date64(), "[946684800000, 946771200000, null]"), + }) { + for (auto ts : { + ArrayFromJSON(timestamp(TimeUnit::SECOND), "[946684800, 946771200, null]"), + ArrayFromJSON(timestamp(TimeUnit::MILLI), + "[946684800000, 946771200000, null]"), + ArrayFromJSON(timestamp(TimeUnit::MICRO), + "[946684800000000, 946771200000000, null]"), + ArrayFromJSON(timestamp(TimeUnit::NANO), + "[946684800000000000, 946771200000000000, null]"), + }) { + CheckCast(ts, date); + } - // Multiply promotions - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckCase(time32(TimeUnit::SECOND), v1, is_valid, - time32(TimeUnit::MILLI), e1, options, - 
/*check_scalar=*/false); - - std::vector v2 = {0, 100, 200, 1, 2}; - std::vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase(time32(TimeUnit::SECOND), v2, is_valid, - time64(TimeUnit::MICRO), e2, options, - /*check_scalar=*/false); - - std::vector v3 = {0, 100, 200, 1, 2}; - std::vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckCase(time32(TimeUnit::SECOND), v3, is_valid, - time64(TimeUnit::NANO), e3, options, - /*check_scalar=*/false); - - std::vector v4 = {0, 100, 200, 1, 2}; - std::vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckCase(time32(TimeUnit::MILLI), v4, is_valid, - time64(TimeUnit::MICRO), e4, options, - /*check_scalar=*/false); - - std::vector v5 = {0, 100, 200, 1, 2}; - std::vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase(time32(TimeUnit::MILLI), v5, is_valid, - time64(TimeUnit::NANO), e5, options, - /*check_scalar=*/false); - - std::vector v6 = {0, 100, 200, 1, 2}; - std::vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckCase(time64(TimeUnit::MICRO), v6, is_valid, - time64(TimeUnit::NANO), e6, options, - /*check_scalar=*/false); - - // Zero copy - std::vector v7 = {0, 70000, 2000, 1000, 0}; - std::shared_ptr arr; - ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int64()); - - std::vector v7_2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int32()); + for (auto ts : { + ArrayFromJSON(timestamp(TimeUnit::SECOND), "[946684801, 946771201, null]"), + ArrayFromJSON(timestamp(TimeUnit::MILLI), + "[946684800001, 946771200001, null]"), + ArrayFromJSON(timestamp(TimeUnit::MICRO), + "[946684800000001, 946771200000001, null]"), + ArrayFromJSON(timestamp(TimeUnit::NANO), + "[946684800000000001, 946771200000000001, 
null]"), + }) { + auto options = CastOptions::Safe(date->type()); + CheckCastFails(ts, options); + + options.allow_time_truncate = true; + CheckCast(ts, date, options); + } - // Divide, truncate - std::vector v8 = {0, 100123, 200456, 1123, 2456}; - std::vector e8 = {0, 100, 200, 1, 2}; + auto options = CastOptions::Safe(date->type()); + auto ts = ArrayFromJSON(timestamp(TimeUnit::SECOND), "[946684800, 946771200, 1]"); + CheckCastFails(ts, options); - options.allow_time_truncate = true; - CheckCase(time32(TimeUnit::MILLI), v8, is_valid, - time32(TimeUnit::SECOND), e8, options, - /*check_scalar=*/false); - CheckCase(time64(TimeUnit::MICRO), v8, is_valid, - time32(TimeUnit::MILLI), e8, options, - /*check_scalar=*/false); - CheckCase(time64(TimeUnit::NANO), v8, is_valid, - time64(TimeUnit::MICRO), e8, options, - /*check_scalar=*/false); - - std::vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - std::vector e9 = {0, 100, 200, 1, 2}; - CheckCase(time64(TimeUnit::MICRO), v9, is_valid, - time32(TimeUnit::SECOND), e9, options, - /*check_scalar=*/false); - CheckCase(time64(TimeUnit::NANO), v9, is_valid, - time32(TimeUnit::MILLI), e9, options, - /*check_scalar=*/false); - - std::vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - std::vector e10 = {0, 100, 200, 1, 2}; - CheckCase(time64(TimeUnit::NANO), v10, is_valid, - time32(TimeUnit::SECOND), e10, options, - /*check_scalar=*/false); - - // Disallow truncate, failures - - options.allow_time_truncate = false; - CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), - options, /*check_scalar=*/false); - CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), - options, /*check_scalar=*/false); - CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), - options, /*check_scalar=*/false); - CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), - options, /*check_scalar=*/false); - CheckFails(time64(TimeUnit::NANO), v9, 
is_valid, time32(TimeUnit::MILLI), - options, /*check_scalar=*/false); - CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), - options, /*check_scalar=*/false); + // Make sure that nulls are excluded from the truncation checks + CheckCast(MaskArrayWithNullsAt(ts, {2}), date); + } } -TEST_F(TestCast, DateToCompatible) { +TEST(Cast, TimeToTime) { + struct TimeTypePair { + std::shared_ptr coarse, fine; + }; + CastOptions options; - std::vector is_valid = {true, false, true, true, true}; + for (auto types : { + TimeTypePair{time32(TimeUnit::SECOND), time32(TimeUnit::MILLI)}, + TimeTypePair{time32(TimeUnit::MILLI), time64(TimeUnit::MICRO)}, + TimeTypePair{time64(TimeUnit::MICRO), time64(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = ArrayFromJSON(types.fine, "[0, null, 200000, 1000, 2000]"); - constexpr int64_t F = 86400000; + // multiply/promote + CheckCast(coarse, promoted); - // Multiply promotion - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; - CheckCase(date32(), v1, is_valid, date64(), e1, options, - /*check_scalar=*/false); + auto will_be_truncated = ArrayFromJSON(types.fine, "[0, null, 200456, 1123, 2456]"); - // Zero copy - std::vector v2 = {0, 70000, 2000, 1000, 0}; - std::vector v3 = {0, 70000, 2000, 1000, 0}; - std::shared_ptr arr; - ArrayFromVector(date32(), is_valid, v2, &arr); - CheckZeroCopy(*arr, date32()); + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int32()); + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } - ArrayFromVector(date64(), is_valid, v3, &arr); - CheckZeroCopy(*arr, date64()); + for (auto types : { + TimeTypePair{time32(TimeUnit::SECOND), 
time64(TimeUnit::MICRO)}, + TimeTypePair{time32(TimeUnit::MILLI), time64(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = ArrayFromJSON(types.fine, "[0, null, 200000000, 1000000, 2000000]"); - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int64()); + // multiply/promote + CheckCast(coarse, promoted); - // Divide, truncate - std::vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; - std::vector e8 = {0, 100, 200, 1, 2}; + auto will_be_truncated = + ArrayFromJSON(types.fine, "[0, null, 200456000, 1123000, 2456000]"); - options.allow_time_truncate = true; - CheckCase(date64(), v8, is_valid, date32(), e8, options, - /*check_scalar=*/false); + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(v8, is_valid, date32(), options, /*check_scalar=*/false); -} + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } -TEST_F(TestCast, DurationToCompatible) { - CastOptions options; + for (auto types : { + TimeTypePair{time32(TimeUnit::SECOND), time64(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = + ArrayFromJSON(types.fine, "[0, null, 200000000000, 1000000000, 2000000000]"); - auto CheckDurationCast = - [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, - const std::vector& from_values, - const std::vector& to_values, const std::vector& is_valid) { - CheckCase(duration(from_unit), from_values, is_valid, - duration(to_unit), to_values, options, - /*check_scalar=*/false); - }; + // multiply/promote + CheckCast(coarse, promoted); - std::vector is_valid = {true, false, true, true, true}; + auto will_be_truncated = + 
ArrayFromJSON(types.fine, "[0, null, 200456000000, 1123000000, 2456000000]"); - // Multiply promotions - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckDurationCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); - std::vector v2 = {0, 100, 200, 1, 2}; - std::vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckDurationCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } +} + +TEST(Cast, TimeZeroCopy) { + for (auto zero_copy_to_type : { + time32(TimeUnit::SECOND), + int32(), // ARROW-1773: cast to int32 + }) { + CheckCastZeroCopy(ArrayFromJSON(time32(TimeUnit::SECOND), "[0, null, 2000, 1000, 0]"), + zero_copy_to_type); + } + CheckCastZeroCopy(ArrayFromJSON(int32(), "[0, null, 2000, 1000, 0]"), + time32(TimeUnit::SECOND)); + + for (auto zero_copy_to_type : { + time64(TimeUnit::MICRO), + int64(), // ARROW-1773: cast to int64 + }) { + CheckCastZeroCopy(ArrayFromJSON(time64(TimeUnit::MICRO), "[0, null, 2000, 1000, 0]"), + zero_copy_to_type); + } + CheckCastZeroCopy(ArrayFromJSON(int64(), "[0, null, 2000, 1000, 0]"), + time64(TimeUnit::MICRO)); +} - std::vector v3 = {0, 100, 200, 1, 2}; - std::vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckDurationCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); +TEST(Cast, DateToDate) { + auto day_32 = ArrayFromJSON(date32(), "[0, null, 100, 1, 10]"); + auto day_64 = ArrayFromJSON(date64(), R"([ + 0, + null, + 8640000000, + 86400000, + 864000000])"); - std::vector v4 = {0, 100, 200, 1, 2}; - std::vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckDurationCast(options, TimeUnit::MILLI, TimeUnit::MICRO, 
v4, e4, is_valid); + // Multiply promotion + CheckCast(day_32, day_64); - std::vector v5 = {0, 100, 200, 1, 2}; - std::vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckDurationCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); + // No truncation + CheckCast(day_64, day_32); - std::vector v6 = {0, 100, 200, 1, 2}; - std::vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckDurationCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); + auto day_64_will_be_truncated = ArrayFromJSON(date64(), R"([ + 0, + null, + 8640000123, + 86400456, + 864000789])"); - // Zero copy - std::vector v7 = {0, 70000, 2000, 1000, 0}; - std::shared_ptr arr; - ArrayFromVector(duration(TimeUnit::SECOND), is_valid, v7, &arr); - CheckZeroCopy(*arr, duration(TimeUnit::SECOND)); - CheckZeroCopy(*arr, int64()); + // Disallow truncate + CastOptions options; + options.to_type = date32(); + CheckCastFails(day_64_will_be_truncated, options); // Divide, truncate - std::vector v8 = {0, 100123, 200456, 1123, 2456}; - std::vector e8 = {0, 100, 200, 1, 2}; - options.allow_time_truncate = true; - CheckDurationCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); - CheckDurationCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); - CheckDurationCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); - - std::vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - std::vector e9 = {0, 100, 200, 1, 2}; - CheckDurationCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); - CheckDurationCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); - - std::vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - std::vector e10 = {0, 100, 200, 1, 2}; - CheckDurationCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(duration(TimeUnit::MILLI), v8, is_valid, - duration(TimeUnit::SECOND), 
options, /*check_scalar=*/false); - CheckFails(duration(TimeUnit::MICRO), v8, is_valid, - duration(TimeUnit::MILLI), options, /*check_scalar=*/false); - CheckFails(duration(TimeUnit::NANO), v8, is_valid, - duration(TimeUnit::MICRO), options, /*check_scalar=*/false); - CheckFails(duration(TimeUnit::MICRO), v9, is_valid, - duration(TimeUnit::SECOND), options, /*check_scalar=*/false); - CheckFails(duration(TimeUnit::NANO), v9, is_valid, - duration(TimeUnit::MILLI), options, /*check_scalar=*/false); - CheckFails(duration(TimeUnit::NANO), v10, is_valid, - duration(TimeUnit::SECOND), options, /*check_scalar=*/false); - - // Multiply overflow - - // 1000-01-01, 1800-01-01 , 2000-01-01, 2300-01-01, 3000-01-01 - std::vector v11 = {10000000000, 1, 2, 3, 10000000000}; + CheckCast(day_64_will_be_truncated, day_32, options); +} - options.allow_time_overflow = false; - CheckFails(duration(TimeUnit::SECOND), v11, is_valid, - duration(TimeUnit::NANO), options, /*check_scalar=*/false); +TEST(Cast, DateZeroCopy) { + for (auto zero_copy_to_type : { + date32(), + int32(), // ARROW-1773: cast to int32 + }) { + CheckCastZeroCopy(ArrayFromJSON(date32(), "[0, null, 2000, 1000, 0]"), + zero_copy_to_type); + } + CheckCastZeroCopy(ArrayFromJSON(int32(), "[0, null, 2000, 1000, 0]"), date32()); + + for (auto zero_copy_to_type : { + date64(), + int64(), // ARROW-1773: cast to int64 + }) { + CheckCastZeroCopy(ArrayFromJSON(date64(), "[0, null, 2000, 1000, 0]"), + zero_copy_to_type); + } + CheckCastZeroCopy(ArrayFromJSON(int64(), "[0, null, 2000, 1000, 0]"), date64()); } -TEST_F(TestCast, ToDouble) { +TEST(Cast, DurationToDuration) { + struct DurationTypePair { + std::shared_ptr coarse, fine; + }; + CastOptions options; - std::vector is_valid = {true, false, true, true, true}; - // int16 to double - std::vector v1 = {0, 100, 200, 1, 2}; - std::vector e1 = {0, 100, 200, 1, 2}; - CheckCase(v1, is_valid, e1, options); + for (auto types : { + DurationTypePair{duration(TimeUnit::SECOND), 
duration(TimeUnit::MILLI)}, + DurationTypePair{duration(TimeUnit::MILLI), duration(TimeUnit::MICRO)}, + DurationTypePair{duration(TimeUnit::MICRO), duration(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = ArrayFromJSON(types.fine, "[0, null, 200000, 1000, 2000]"); - // float to double - std::vector v2 = {0, 100, 200, 1, 2}; - std::vector e2 = {0, 100, 200, 1, 2}; - CheckCase(v2, is_valid, e2, options); + // multiply/promote + CheckCast(coarse, promoted); - // bool to double - std::vector v3 = {true, true, false, false, true}; - std::vector e3 = {1, 1, 0, 0, 1}; - CheckCase(v3, is_valid, e3, options); -} + auto will_be_truncated = ArrayFromJSON(types.fine, "[0, null, 200456, 1123, 2456]"); + + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); -TEST_F(TestCast, ChunkedArray) { - std::vector values1 = {0, 1, 2}; - std::vector values2 = {3, 4, 5}; + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } - auto type = int16(); - auto out_type = int64(); + for (auto types : { + DurationTypePair{duration(TimeUnit::SECOND), duration(TimeUnit::MICRO)}, + DurationTypePair{duration(TimeUnit::MILLI), duration(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = ArrayFromJSON(types.fine, "[0, null, 200000000, 1000000, 2000000]"); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); + // multiply/promote + CheckCast(coarse, promoted); - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); + auto will_be_truncated = + ArrayFromJSON(types.fine, "[0, null, 200000456, 1000123, 2000456]"); - CastOptions options; + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + 
CheckCastFails(will_be_truncated, options); + + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } + + for (auto types : { + DurationTypePair{duration(TimeUnit::SECOND), duration(TimeUnit::NANO)}, + }) { + auto coarse = ArrayFromJSON(types.coarse, "[0, null, 200, 1, 2]"); + auto promoted = + ArrayFromJSON(types.fine, "[0, null, 200000000000, 1000000000, 2000000000]"); + + // multiply/promote + CheckCast(coarse, promoted); + + auto will_be_truncated = + ArrayFromJSON(types.fine, "[0, null, 200000000456, 1000000123, 2000000456]"); + + // with truncation disallowed, fails + options.allow_time_truncate = false; + options.to_type = types.coarse; + CheckCastFails(will_be_truncated, options); + + // with truncation allowed, divide/truncate + options.allow_time_truncate = true; + CheckCast(will_be_truncated, coarse, options); + } +} - ASSERT_OK_AND_ASSIGN(Datum out, Cast(carr, out_type, options)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); +TEST(Cast, DurationZeroCopy) { + for (auto zero_copy_to_type : { + duration(TimeUnit::SECOND), + int64(), // ARROW-1773: cast to int64 + }) { + CheckCastZeroCopy( + ArrayFromJSON(duration(TimeUnit::SECOND), "[0, null, 2000, 1000, 0]"), + zero_copy_to_type); + } + CheckCastZeroCopy(ArrayFromJSON(int64(), "[0, null, 2000, 1000, 0]"), + duration(TimeUnit::SECOND)); +} - auto out_carr = out.chunked_array(); +TEST(Cast, DurationToDurationMultiplyOverflow) { + CastOptions options; + options.to_type = duration(TimeUnit::NANO); + CheckCastFails( + ArrayFromJSON(duration(TimeUnit::SECOND), "[10000000000, 1, 2, 3, 10000000000]"), + options); +} - std::vector ex_values1 = {0, 1, 2}; - std::vector ex_values2 = {3, 4, 5}; - auto a3 = _MakeArray(out_type, ex_values1, {}); - auto a4 = _MakeArray(out_type, ex_values2, {}); +TEST(Cast, MiscToFloating) { + for (auto to_type : {float32(), float64()}) { + CheckCast(ArrayFromJSON(int16(), "[0, null, 200, 1, 2]"), + 
ArrayFromJSON(to_type, "[0, null, 200, 1, 2]")); - ArrayVector ex_arrays = {a3, a4}; - auto ex_carr = std::make_shared(ex_arrays); + CheckCast(ArrayFromJSON(float32(), "[0, null, 200, 1, 2]"), + ArrayFromJSON(to_type, "[0, null, 200, 1, 2]")); - ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); + CheckCast(ArrayFromJSON(boolean(), "[true, null, false, false, true]"), + ArrayFromJSON(to_type, "[1, null, 0, 0, 1]")); + } } -TEST_F(TestCast, UnsupportedInputType) { +TEST(Cast, UnsupportedInputType) { // Casting to a supported target type, but with an unsupported input type // for the target type. const auto arr = ArrayFromJSON(int32(), "[1, 2, 3]"); @@ -1536,7 +1044,7 @@ TEST_F(TestCast, UnsupportedInputType) { CallFunction("cast", {arr}, &options)); } -TEST_F(TestCast, UnsupportedTargetType) { +TEST(Cast, UnsupportedTargetType) { // Casting to an unsupported target type const auto arr = ArrayFromJSON(int32(), "[1, 2, 3]"); const auto to_type = dense_union({field("a", int32())}); @@ -1553,221 +1061,317 @@ TEST_F(TestCast, UnsupportedTargetType) { CallFunction("cast", {arr}, &options)); } -TEST_F(TestCast, DateTimeZeroCopy) { - std::vector is_valid = {true, false, true, true, true}; +TEST(Cast, StringToBoolean) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(string_type, R"(["False", null, "true", "True", "false"])"), + ArrayFromJSON(boolean(), "[false, null, true, true, false]")); - std::vector v1 = {0, 70000, 2000, 1000, 0}; - std::shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); + CheckCast(ArrayFromJSON(string_type, R"(["0", null, "1", "1", "0"])"), + ArrayFromJSON(boolean(), "[false, null, true, true, false]")); - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - CheckZeroCopy(*arr, date32()); + auto options = CastOptions::Safe(boolean()); + CheckCastFails(ArrayFromJSON(string_type, R"(["false "])"), options); + CheckCastFails(ArrayFromJSON(string_type, R"(["T"])"), options); + } +} - std::vector v2 = {0, 70000, 
2000, 1000, 0}; - ArrayFromVector(int64(), is_valid, v2, &arr); +TEST(Cast, StringToInt) { + for (auto string_type : {utf8(), large_utf8()}) { + for (auto signed_type : {int8(), int16(), int32(), int64()}) { + CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0"])"), + ArrayFromJSON(signed_type, "[0, null, 127, -1, 0]")); + } - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - CheckZeroCopy(*arr, date64()); - CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); - CheckZeroCopy(*arr, duration(TimeUnit::MILLI)); -} + CheckCast( + ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", "0"])"), + ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0]")); -TEST_F(TestCast, StringToBoolean) { - CastOptions options; + CheckCast(ArrayFromJSON( + string_type, + R"(["9223372036854775807", null, "-9223372036854775808", "0", "0"])"), + ArrayFromJSON(int64(), + "[9223372036854775807, null, -9223372036854775808, 0, 0]")); - std::vector is_valid = {true, false, true, true, true}; + for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) { + CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0"])"), + ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0]")); + } - std::vector v1 = {"False", "true", "true", "True", "false"}; - std::vector v2 = {"0", "1", "1", "1", "0"}; - std::vector e = {false, true, true, true, false}; - CheckCase(utf8(), v1, is_valid, boolean(), e, - options); - CheckCase(utf8(), v2, is_valid, boolean(), e, - options); + CheckCast( + ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", "0"])"), + ArrayFromJSON(uint32(), "[2147483647, null, 4294967295, 0, 0]")); + + CheckCast(ArrayFromJSON( + string_type, + R"(["9223372036854775807", null, "18446744073709551615", "0", "0"])"), + ArrayFromJSON(uint64(), + "[9223372036854775807, null, 18446744073709551615, 0, 0]")); + + for (std::string not_int8 : { + "z", + "12 z", + "128", + "-129", + "0.5", + }) { + auto options = 
CastOptions::Safe(int8()); + CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options); + } - // Same with LargeStringType - CheckCase(v1, is_valid, e, options); + for (std::string not_uint8 : { + "256", + "-1", + "0.5", + }) { + auto options = CastOptions::Safe(uint8()); + CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options); + } + } } -TEST_F(TestCast, StringToBooleanErrors) { - CastOptions options; - - std::vector is_valid = {true}; +TEST(Cast, StringToFloating) { + for (auto string_type : {utf8(), large_utf8()}) { + for (auto float_type : {float32(), float64()}) { + auto strings = + ArrayFromJSON(string_type, R"(["0.1", null, "127.3", "1e3", "200.4", "0.5"])"); + auto floats = ArrayFromJSON(float_type, "[0.1, null, 127.3, 1000, 200.4, 0.5]"); + CheckCast(strings, floats); + + for (std::string not_float : { + "z", + }) { + auto options = CastOptions::Safe(float32()); + CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_float + "\"]"), options); + } - CheckFails({"false "}, is_valid, boolean(), options); - CheckFails({"T"}, is_valid, boolean(), options); - CheckFails({"T"}, is_valid, boolean(), options); +#if !defined(_WIN32) || defined(NDEBUG) + // Test that casting is locale-independent + // French locale uses the comma as decimal point + LocaleGuard locale_guard("fr_FR.UTF-8"); + CheckCast(strings, floats); +#endif + } + } } -TEST_F(TestCast, StringToNumber) { TestCastStringToNumber(); } +TEST(Cast, StringToTimestamp) { + for (auto string_type : {utf8(), large_utf8()}) { + auto strings = ArrayFromJSON(string_type, R"(["1970-01-01", null, "2000-02-29"])"); -TEST_F(TestCast, LargeStringToNumber) { TestCastStringToNumber(); } + CheckCast(strings, + ArrayFromJSON(timestamp(TimeUnit::SECOND), "[0, null, 951782400]")); -TEST_F(TestCast, StringToNumberErrors) { - CastOptions options; + CheckCast(strings, + ArrayFromJSON(timestamp(TimeUnit::MICRO), "[0, null, 951782400000000]")); + + for (auto unit : + 
{TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { + for (std::string not_ts : { + "", + "xxx", + }) { + auto options = CastOptions::Safe(timestamp(unit)); + CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_ts + "\"]"), options); + } + } - std::vector is_valid = {true}; + // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc + } +} - CheckFails({"z"}, is_valid, int8(), options); - CheckFails({"12 z"}, is_valid, int8(), options); - CheckFails({"128"}, is_valid, int8(), options); - CheckFails({"-129"}, is_valid, int8(), options); - CheckFails({"0.5"}, is_valid, int8(), options); +static void AssertBinaryZeroCopy(std::shared_ptr lhs, std::shared_ptr rhs) { + // null bitmap and data buffers are always zero-copied + AssertBufferSame(*lhs, *rhs, 0); + AssertBufferSame(*lhs, *rhs, 2); - CheckFails({"256"}, is_valid, uint8(), options); - CheckFails({"-1"}, is_valid, uint8(), options); + if (offset_bit_width(lhs->type_id()) == offset_bit_width(rhs->type_id())) { + // offset buffer is zero copied if possible + AssertBufferSame(*lhs, *rhs, 1); + return; + } - CheckFails({"z"}, is_valid, float32(), options); + // offset buffers are equivalent + ArrayVector offsets; + for (auto array : {lhs, rhs}) { + auto length = array->length(); + auto buffer = array->data()->buffers[1]; + offsets.push_back(offset_bit_width(array->type_id()) == 32 + ? 
*Cast(Int32Array(length, buffer), int64()) + : std::make_shared(length, buffer)); + } + AssertArraysEqual(*offsets[0], *offsets[1]); } -TEST_F(TestCast, StringToTimestamp) { TestCastStringToTimestamp(); } +TEST(Cast, BinaryToString) { + for (auto bin_type : {binary(), large_binary()}) { + for (auto string_type : {utf8(), large_utf8()}) { + // empty -> empty always works + CheckCast(ArrayFromJSON(bin_type, "[]"), ArrayFromJSON(string_type, "[]")); -TEST_F(TestCast, LargeStringToTimestamp) { TestCastStringToTimestamp(); } + auto invalid_utf8 = InvalidUtf8(bin_type); -TEST_F(TestCast, StringToTimestampErrors) { - CastOptions options; + // invalid utf-8 masked by a null bit is not an error + CheckCast(MaskArrayWithNullsAt(InvalidUtf8(bin_type), {4}), + MaskArrayWithNullsAt(InvalidUtf8(string_type), {4})); - std::vector is_valid = {true}; + // error: invalid utf-8 + auto options = CastOptions::Safe(string_type); + CheckCastFails(invalid_utf8, options); - for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { - auto type = timestamp(unit); - CheckFails({""}, is_valid, type, options); - CheckFails({"xxx"}, is_valid, type, options); + // override utf-8 check + options.allow_invalid_utf8 = true; + ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, string_type, options)); + ASSERT_RAISES(Invalid, strings->ValidateFull()); + AssertBinaryZeroCopy(invalid_utf8, strings); + } } } -TEST_F(TestCast, BinaryToString) { TestCastBinaryToBinary(); } +TEST(Cast, BinaryOrStringToBinary) { + for (auto from_type : {utf8(), large_utf8(), binary(), large_binary()}) { + for (auto to_type : {binary(), large_binary()}) { + // empty -> empty always works + CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(to_type, "[]")); -TEST_F(TestCast, BinaryToLargeBinary) { - TestCastBinaryToBinary(); -} + auto invalid_utf8 = InvalidUtf8(from_type); -TEST_F(TestCast, BinaryToLargeString) { - TestCastBinaryToBinary(); -} + // invalid utf-8 is not an error for 
binary + ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, to_type)); + ASSERT_OK(strings->ValidateFull()); + AssertBinaryZeroCopy(invalid_utf8, strings); -TEST_F(TestCast, LargeBinaryToBinary) { - TestCastBinaryToBinary(); + // invalid utf-8 masked by a null bit is not an error + CheckCast(MaskArrayWithNullsAt(InvalidUtf8(from_type), {4}), + MaskArrayWithNullsAt(InvalidUtf8(to_type), {4})); + } + } } -TEST_F(TestCast, LargeBinaryToString) { - TestCastBinaryToBinary(); -} +TEST(Cast, StringToString) { + for (auto from_type : {utf8(), large_utf8()}) { + for (auto to_type : {utf8(), large_utf8()}) { + // empty -> empty always works + CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(to_type, "[]")); -TEST_F(TestCast, LargeBinaryToLargeString) { - TestCastBinaryToBinary(); -} + auto invalid_utf8 = InvalidUtf8(from_type); -TEST_F(TestCast, StringToBinary) { TestCastBinaryToBinary(); } + // invalid utf-8 masked by a null bit is not an error + CheckCast(MaskArrayWithNullsAt(invalid_utf8, {4}), + MaskArrayWithNullsAt(InvalidUtf8(to_type), {4})); -TEST_F(TestCast, StringToLargeBinary) { - TestCastBinaryToBinary(); + // override utf-8 check + auto options = CastOptions::Safe(to_type); + options.allow_invalid_utf8 = true; + // utf-8 is not checked by Cast when the origin guarantees utf-8 + ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, to_type, options)); + ASSERT_RAISES(Invalid, strings->ValidateFull()); + AssertBinaryZeroCopy(invalid_utf8, strings); + } + } } -TEST_F(TestCast, StringToLargeString) { - TestCastBinaryToBinary(); -} +TEST(Cast, IntToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "127", "-128", null])")); -TEST_F(TestCast, LargeStringToBinary) { - TestCastBinaryToBinary(); -} + CheckCast(ArrayFromJSON(uint8(), "[0, 1, 255, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "255", null])")); -TEST_F(TestCast, 
LargeStringToString) { - TestCastBinaryToBinary(); -} + CheckCast(ArrayFromJSON(int16(), "[0, 1, 32767, -32768, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "32767", "-32768", null])")); -TEST_F(TestCast, LargeStringToLargeBinary) { - TestCastBinaryToBinary(); -} + CheckCast(ArrayFromJSON(uint16(), "[0, 1, 65535, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "65535", null])")); -TEST_F(TestCast, NumberToString) { TestCastNumberToString(); } + CheckCast( + ArrayFromJSON(int32(), "[0, 1, 2147483647, -2147483648, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "2147483647", "-2147483648", null])")); -TEST_F(TestCast, NumberToLargeString) { TestCastNumberToString(); } + CheckCast(ArrayFromJSON(uint32(), "[0, 1, 4294967295, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "4294967295", null])")); -TEST_F(TestCast, BooleanToString) { TestCastBooleanToString(); } + CheckCast( + ArrayFromJSON(int64(), "[0, 1, 9223372036854775807, -9223372036854775808, null]"), + ArrayFromJSON( + string_type, + R"(["0", "1", "9223372036854775807", "-9223372036854775808", null])")); -TEST_F(TestCast, BooleanToLargeString) { TestCastBooleanToString(); } + CheckCast(ArrayFromJSON(uint64(), "[0, 1, 18446744073709551615, null]"), + ArrayFromJSON(string_type, R"(["0", "1", "18446744073709551615", null])")); + } +} -TEST_F(TestCast, ListToPrimitive) { - auto from_int = ArrayFromJSON(list(int8()), "[[1, 2], [3, 4]]"); - auto from_binary = ArrayFromJSON(list(binary()), "[[\"1\", \"2\"], [\"3\", \"4\"]]"); +TEST(Cast, FloatingToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast( + ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), + ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); - ASSERT_RAISES(NotImplemented, Cast(*from_int, uint8())); - ASSERT_RAISES(NotImplemented, Cast(*from_binary, utf8())); + CheckCast( + ArrayFromJSON(float64(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"), + ArrayFromJSON(string_type, 
R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])")); + } } -TEST_F(TestCast, ListToList) { - CastOptions options; - std::shared_ptr offsets; - - std::vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; - std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; - ArrayFromVector(offsets_is_valid, offsets_values, &offsets); - - std::shared_ptr int32_plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - ASSERT_OK_AND_ASSIGN(auto int32_list_array, - ListArray::FromArrays(*offsets, *int32_plain_array, pool_)); - - ASSERT_OK_AND_ASSIGN(std::shared_ptr int64_plain_array, - Cast(*int32_plain_array, int64(), options)); - ASSERT_OK_AND_ASSIGN(auto int64_list_array, - ListArray::FromArrays(*offsets, *int64_plain_array, pool_)); - - ASSERT_OK_AND_ASSIGN(std::shared_ptr float64_plain_array, - Cast(*int32_plain_array, float64(), options)); - ASSERT_OK_AND_ASSIGN(auto float64_list_array, - ListArray::FromArrays(*offsets, *float64_plain_array, pool_)); - - CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options, - /*check_scalar=*/false); - CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options, - /*check_scalar=*/false); - CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options, - /*check_scalar=*/false); - CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options, - /*check_scalar=*/false); - - options.allow_float_truncate = true; - CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options, - /*check_scalar=*/false); - CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options, - /*check_scalar=*/false); +TEST(Cast, BooleanToString) { + for (auto string_type : {utf8(), large_utf8()}) { + CheckCast(ArrayFromJSON(boolean(), "[true, true, false, null]"), + ArrayFromJSON(string_type, R"(["true", "true", "false", null])")); + } } -TEST_F(TestCast, LargeListToLargeList) { - // Like 
ListToList above, only testing the basics - CastOptions options; - std::shared_ptr offsets; +TEST(Cast, ListToPrimitive) { + ASSERT_RAISES(NotImplemented, + Cast(*ArrayFromJSON(list(int8()), "[[1, 2], [3, 4]]"), uint8())); - std::vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; - std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; - ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + ASSERT_RAISES( + NotImplemented, + Cast(*ArrayFromJSON(list(binary()), R"([["1", "2"], ["3", "4"]])"), utf8())); +} - std::shared_ptr int32_plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - ASSERT_OK_AND_ASSIGN(auto int32_list_array, - LargeListArray::FromArrays(*offsets, *int32_plain_array, pool_)); +TEST(Cast, ListToList) { + using make_list_t = std::shared_ptr(const std::shared_ptr&); + for (auto make_list : std::vector{&list, &large_list}) { + auto list_int32 = + ArrayFromJSON(make_list(int32()), + "[[0], [1], null, [2, 3, 4], [5, 6], null, [], [7], [8, 9]]") + ->data(); + + auto list_int64 = list_int32->Copy(); + list_int64->type = make_list(int64()); + list_int64->child_data[0] = Cast(list_int32->child_data[0], int64())->array(); + ASSERT_OK(MakeArray(list_int64)->ValidateFull()); + + auto list_float32 = list_int32->Copy(); + list_float32->type = make_list(float32()); + list_float32->child_data[0] = Cast(list_int32->child_data[0], float32())->array(); + ASSERT_OK(MakeArray(list_float32)->ValidateFull()); + + CheckCast(MakeArray(list_int32), MakeArray(list_float32)); + CheckCast(MakeArray(list_float32), MakeArray(list_int64)); + CheckCast(MakeArray(list_int64), MakeArray(list_float32)); + + CheckCast(MakeArray(list_int32), MakeArray(list_int64)); + CheckCast(MakeArray(list_float32), MakeArray(list_int32)); + CheckCast(MakeArray(list_int64), MakeArray(list_int32)); + } +} - ASSERT_OK_AND_ASSIGN(std::shared_ptr float64_plain_array, - Cast(*int32_plain_array, float64(), options)); - ASSERT_OK_AND_ASSIGN(auto float64_list_array, 
- LargeListArray::FromArrays(*offsets, *float64_plain_array, pool_)); +TEST(Cast, ListToListOptionsPassthru) { + auto list_int32 = ArrayFromJSON(list(int32()), "[[87654321]]"); - CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options, - /*check_scalar=*/false); + auto options = CastOptions::Safe(list(int16())); + CheckCastFails(list_int32, options); - options.allow_float_truncate = true; - CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options, - /*check_scalar=*/false); + options.allow_int_overflow = true; + CheckCast(list_int32, ArrayFromJSON(list(int16()), "[[32689]]"), options); } -TEST_F(TestCast, IdentityCasts) { +TEST(Cast, IdentityCasts) { // ARROW-4102 - auto CheckIdentityCast = [this](std::shared_ptr type, - const std::string& json) { - auto arr = ArrayFromJSON(type, json); - CheckZeroCopy(*arr, type); + auto CheckIdentityCast = [](std::shared_ptr type, const std::string& json) { + CheckCastZeroCopy(ArrayFromJSON(type, json), type); }; CheckIdentityCast(null(), "[null, null, null]"); @@ -1776,9 +1380,9 @@ TEST_F(TestCast, IdentityCasts) { for (auto type : kNumericTypes) { CheckIdentityCast(type, "[1, 2, null, 4]"); } - CheckIdentityCast(binary(), "[\"foo\", \"bar\"]"); - CheckIdentityCast(utf8(), "[\"foo\", \"bar\"]"); - CheckIdentityCast(fixed_size_binary(3), "[\"foo\", \"bar\"]"); + CheckIdentityCast(binary(), R"(["foo", "bar"])"); + CheckIdentityCast(utf8(), R"(["foo", "bar"])"); + CheckIdentityCast(fixed_size_binary(3), R"(["foo", "bar"])"); CheckIdentityCast(list(int8()), "[[1, 2], [null], [], [3]]"); @@ -1788,134 +1392,97 @@ TEST_F(TestCast, IdentityCasts) { CheckIdentityCast(date64(), "[86400000, 0]"); CheckIdentityCast(timestamp(TimeUnit::SECOND), "[1, 2, 3, 4]"); - { - auto dict_values = ArrayFromJSON(int8(), "[1, 2, 3]"); - auto dict_type = dictionary(int8(), dict_values->type()); - auto dict_indices = ArrayFromJSON(int8(), "[0, 1, 2, 0, null, 2]"); - auto dict_array = - 
std::make_shared(dict_type, dict_indices, dict_values); - CheckZeroCopy(*dict_array, dict_type); - } + CheckIdentityCast(dictionary(int8(), int8()), "[1, 2, 3, 1, null, 3]"); } -TEST_F(TestCast, EmptyCasts) { +TEST(Cast, EmptyCasts) { // ARROW-4766: 0-length arrays should not segfault - auto CheckEmptyCast = [this](std::shared_ptr from, - std::shared_ptr to) { - CastOptions options; - + auto CheckCastEmpty = [](std::shared_ptr from, std::shared_ptr to) { // Python creates array with nullptr instead of 0-length (valid) buffers. auto data = ArrayData::Make(from, /* length */ 0, /* buffers */ {nullptr, nullptr}); - auto input = MakeArray(data); - auto expected = ArrayFromJSON(to, "[]"); - CheckPass(*input, *expected, to, CastOptions{}); + CheckCast(MakeArray(data), ArrayFromJSON(to, "[]")); }; for (auto numeric : kNumericTypes) { - CheckEmptyCast(boolean(), numeric); - CheckEmptyCast(numeric, boolean()); + CheckCastEmpty(boolean(), numeric); + CheckCastEmpty(numeric, boolean()); } } // ---------------------------------------------------------------------- // Test casting from NullType -template -class TestNullCast : public TestCast {}; - -typedef ::testing::Types - TestTypes; - -TYPED_TEST_SUITE(TestNullCast, TestTypes); - -TYPED_TEST(TestNullCast, FromNull) { - // Null casts to everything - const int length = 10; - - // Hack to get a DataType including for parametric types - std::shared_ptr out_type = - TestBase::MakeRandomArray::ArrayType>(0, 0)->type(); - - NullArray arr(length); - - ASSERT_OK_AND_ASSIGN(std::shared_ptr result, Cast(arr, out_type)); - ASSERT_OK(result->ValidateFull()); - - ASSERT_TRUE(result->type()->Equals(*out_type)); - ASSERT_EQ(length, result->length()); - ASSERT_EQ(length, result->null_count()); +TEST(Cast, FromNull) { + for (auto to_type : { + null(), + uint8(), + int8(), + uint16(), + int16(), + uint32(), + int32(), + uint64(), + int64(), + float32(), + float64(), + date32(), + date64(), + fixed_size_binary(10), + binary(), + utf8(), + }) { 
+ ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(to_type, 10)); + CheckCast(std::make_shared(10), expected); + } } // ---------------------------------------------------------------------- // Test casting from DictionaryType -template -class TestDictionaryCast : public TestCast {}; - -typedef ::testing::Types - TestTypes; +TEST(Cast, FromDictionary) { + ArrayVector dictionaries; + dictionaries.push_back(std::make_shared(5)); -TYPED_TEST_SUITE(TestDictionaryCast, TestTypes); + for (auto num_type : kNumericTypes) { + dictionaries.push_back(ArrayFromJSON(num_type, "[23, 12, 45, 12, null]")); + } -TYPED_TEST(TestDictionaryCast, Basic) { - std::shared_ptr dict = - TestBase::MakeRandomArray::ArrayType>(5, 1); - for (auto index_ty : all_dictionary_index_types()) { - auto indices = ArrayFromJSON(index_ty, "[4, 0, 1, 2, 0, 4, null, 2]"); - auto dict_ty = dictionary(index_ty, dict->type()); - auto dict_arr = *DictionaryArray::FromArrays(dict_ty, indices, dict); - std::shared_ptr expected = *Take(*dict, *indices); + for (auto string_type : kBaseBinaryTypes) { + dictionaries.push_back( + ArrayFromJSON(string_type, R"(["foo", "bar", "baz", "foo", null])")); + } - this->CheckPass(*dict_arr, *expected, expected->type(), CastOptions::Safe(), - /*check_scalar=*/false); + for (auto dict : dictionaries) { + for (auto index_type : kDictionaryIndexTypes) { + auto indices = ArrayFromJSON(index_type, "[4, 0, 1, 2, 0, 4, null, 2]"); + ASSERT_OK_AND_ASSIGN(auto expected, Take(*dict, *indices)); - auto opts = CastOptions::Safe(); - opts.to_type = expected->type(); - CheckScalarUnary("cast", dict_arr, expected, &opts); + ASSERT_OK_AND_ASSIGN( + auto dict_arr, DictionaryArray::FromArrays(dictionary(index_type, dict->type()), + indices, dict)); + CheckCast(dict_arr, expected); + } } -} -TYPED_TEST(TestDictionaryCast, NoNulls) { - // Test with a nullptr bitmap buffer (ARROW-3208) - if (TypeParam::type_id == Type::NA) { - // Skip, but gtest doesn't support skipping :-/ - return; - } + for 
(auto dict : dictionaries) { + if (dict->type_id() == Type::NA) continue; - CastOptions options; - std::shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 0); - ASSERT_EQ(plain_array->null_count(), 0); - - // Dict-encode the plain array - ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(plain_array->data())); - - // Make a new dict array with nullptr bitmap buffer - auto data = encoded.array()->Copy(); - data->buffers[0] = nullptr; - data->null_count = 0; - std::shared_ptr dict_array = std::make_shared(data); - ASSERT_OK(dict_array->ValidateFull()); - - this->CheckPass(*dict_array, *plain_array, plain_array->type(), options, - /*check_scalar=*/false); -} + // Test with a nullptr bitmap buffer (ARROW-3208) + auto indices = ArrayFromJSON(int8(), "[0, 0, 1, 2, 0, 3, 3, 2]"); + ASSERT_OK_AND_ASSIGN(auto no_nulls, Take(*dict, *indices)); + ASSERT_EQ(no_nulls->null_count(), 0); + + ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(no_nulls)); + + // Make a new dict array with nullptr bitmap buffer + auto data = encoded.array()->Copy(); + data->buffers[0] = nullptr; + data->null_count = 0; + std::shared_ptr dict_array = std::make_shared(data); + ASSERT_OK(dict_array->ValidateFull()); -// TODO: See how this might cause problems post-refactor -TYPED_TEST(TestDictionaryCast, DISABLED_OutTypeError) { - // ARROW-7077: unsupported out type should return an error - std::shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(0, 0); - auto in_type = dictionary(int32(), plain_array->type()); - - auto out_type = (plain_array->type()->id() == Type::INT8) ? binary() : int8(); - // Test an output type that's not part of TestTypes. 
- out_type = list(in_type); - ASSERT_RAISES(NotImplemented, GetCastFunction(out_type)); + CheckCast(dict_array, no_nulls); + } } std::shared_ptr SmallintArrayFromJSON(const std::string& json_data) { @@ -1925,44 +1492,41 @@ std::shared_ptr SmallintArrayFromJSON(const std::string& json_data) { return MakeArray(ext_data); } -TEST_F(TestCast, ExtensionTypeToIntDowncast) { +TEST(Cast, ExtensionTypeToIntDowncast) { auto smallint = std::make_shared(); ExtensionTypeGuard smallint_guard(smallint); - CastOptions options; - options.allow_int_overflow = false; - std::shared_ptr result; std::vector is_valid = {true, false, true, true, true}; // Smallint(int16) to int16 - auto v0 = SmallintArrayFromJSON("[0, 100, 200, 1, 2]"); - CheckZeroCopy(*v0, int16()); + CheckCastZeroCopy(SmallintArrayFromJSON("[0, 100, 200, 1, 2]"), int16()); // Smallint(int16) to uint8, no overflow/underrun - auto v1 = SmallintArrayFromJSON("[0, 100, 200, 1, 2]"); - auto e1 = ArrayFromJSON(uint8(), "[0, 100, 200, 1, 2]"); - CheckPass(*v1, *e1, uint8(), options, /*check_scalar=*/false); + CheckCast(SmallintArrayFromJSON("[0, 100, 200, 1, 2]"), + ArrayFromJSON(uint8(), "[0, 100, 200, 1, 2]")); // Smallint(int16) to uint8, with overflow - auto v2 = SmallintArrayFromJSON("[0, null, 256, 1, 3]"); - auto e2 = ArrayFromJSON(uint8(), "[0, null, 0, 1, 3]"); - // allow overflow - options.allow_int_overflow = true; - CheckPass(*v2, *e2, uint8(), options, /*check_scalar=*/false); - // disallow overflow - options.allow_int_overflow = false; - ASSERT_RAISES(Invalid, Cast(*v2, uint8(), options)); + { + CastOptions options; + options.to_type = uint8(); + CheckCastFails(SmallintArrayFromJSON("[0, null, 256, 1, 3]"), options); + + options.allow_int_overflow = true; + CheckCast(SmallintArrayFromJSON("[0, null, 256, 1, 3]"), + ArrayFromJSON(uint8(), "[0, null, 0, 1, 3]"), options); + } // Smallint(int16) to uint8, with underflow - auto v3 = SmallintArrayFromJSON("[0, null, -1, 1, 0]"); - auto e3 = ArrayFromJSON(uint8(), "[0, 
null, 255, 1, 0]"); - // allow overflow - options.allow_int_overflow = true; - CheckPass(*v3, *e3, uint8(), options, /*check_scalar=*/false); - // disallow overflow - options.allow_int_overflow = false; - ASSERT_RAISES(Invalid, Cast(*v3, uint8(), options)); + { + CastOptions options; + options.to_type = uint8(); + CheckCastFails(SmallintArrayFromJSON("[0, null, -1, 1, 3]"), options); + + options.allow_int_overflow = true; + CheckCast(SmallintArrayFromJSON("[0, null, -1, 1, 3]"), + ArrayFromJSON(uint8(), "[0, null, 255, 1, 3]"), options); + } } } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/test_util.cc b/cpp/src/arrow/compute/kernels/test_util.cc index 73e900351fb04..a8a0c8b95f322 100644 --- a/cpp/src/arrow/compute/kernels/test_util.cc +++ b/cpp/src/arrow/compute/kernels/test_util.cc @@ -95,9 +95,13 @@ void CheckScalar(std::string func_name, const ArrayVector& inputs, std::shared_ptr expected, const FunctionOptions* options) { CheckScalarNonRecursive(func_name, inputs, expected, options); - // Check all the input scalars - for (int64_t i = 0; i < inputs[0]->length(); ++i) { - CheckScalar(func_name, GetScalars(inputs, i), *expected->GetScalar(i), options); + // Check all the input scalars, if scalars are implemented + if (std::none_of(inputs.begin(), inputs.end(), [](const std::shared_ptr& array) { + return array->type_id() == Type::EXTENSION; + })) { + for (int64_t i = 0; i < inputs[0]->length(); ++i) { + CheckScalar(func_name, GetScalars(inputs, i), *expected->GetScalar(i), options); + } } // Since it's a scalar function, calling it on sliced inputs should diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt index 84b1a103264c8..2766cfd3bd2bd 100644 --- a/cpp/src/arrow/csv/CMakeLists.txt +++ b/cpp/src/arrow/csv/CMakeLists.txt @@ -21,7 +21,8 @@ add_arrow_test(csv-test column_builder_test.cc column_decoder_test.cc converter_test.cc - parser_test.cc) + parser_test.cc + reader_test.cc) 
add_arrow_benchmark(converter_benchmark PREFIX "arrow-csv") add_arrow_benchmark(parser_benchmark PREFIX "arrow-csv") diff --git a/cpp/src/arrow/csv/column_decoder.cc b/cpp/src/arrow/csv/column_decoder.cc index c57477ef59d3b..1dd13bc9086b2 100644 --- a/cpp/src/arrow/csv/column_decoder.cc +++ b/cpp/src/arrow/csv/column_decoder.cc @@ -84,7 +84,7 @@ class ConcreteColumnDecoder : public ColumnDecoder { auto chunk_index = next_chunk_++; WaitForChunkUnlocked(chunk_index); // Move Future to avoid keeping chunk alive - return std::move(chunks_[chunk_index]).result(); + return chunks_[chunk_index].MoveResult(); } protected: diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index cf5047aaf16a2..f0fa1f206d344 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -40,6 +40,8 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/future.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -60,8 +62,7 @@ class InputStream; namespace csv { -using internal::GetCpuThreadPool; -using internal::ThreadPool; +using internal::Executor; struct ConversionSchema { struct Column { @@ -94,20 +95,24 @@ struct ConversionSchema { // An iterator of Buffers that makes sure there is no straddling CRLF sequence. 
class CSVBufferIterator { public: - explicit CSVBufferIterator(Iterator> buffer_iterator) - : buffer_iterator_(std::move(buffer_iterator)) {} - static Iterator> Make( Iterator> buffer_iterator) { - CSVBufferIterator it(std::move(buffer_iterator)); - return Iterator>(std::move(it)); + Transformer, std::shared_ptr> fn = + CSVBufferIterator(); + return MakeTransformedIterator(std::move(buffer_iterator), fn); + } + + static AsyncGenerator> MakeAsync( + AsyncGenerator> buffer_iterator) { + Transformer, std::shared_ptr> fn = + CSVBufferIterator(); + return MakeAsyncGenerator(std::move(buffer_iterator), fn); } - Result> Next() { - ARROW_ASSIGN_OR_RAISE(auto buf, buffer_iterator_.Next()); + Result>> operator()(std::shared_ptr buf) { if (buf == nullptr) { // EOF - return nullptr; + return TransformFinish(); } int64_t offset = 0; @@ -127,14 +132,13 @@ class CSVBufferIterator { buf = SliceBuffer(buf, offset); if (buf->size() == 0) { // EOF - return nullptr; + return TransformFinish(); } else { - return buf; + return TransformYield(buf); } } protected: - Iterator> buffer_iterator_; bool first_buffer_ = true; // Whether there was a trailing CR at the end of last received buffer bool trailing_cr_ = false; @@ -150,20 +154,36 @@ struct CSVBlock { std::function consume_bytes; }; +} // namespace csv + +template <> +struct IterationTraits { + static csv::CSVBlock End() { return csv::CSVBlock{{}, {}, {}, -1, true, {}}; } +}; + +namespace csv { + +// The == operator must be defined to be used as T in Iterator +bool operator==(const CSVBlock& left, const CSVBlock& right) { + return left.block_index == right.block_index; +} +bool operator!=(const CSVBlock& left, const CSVBlock& right) { + return left.block_index != right.block_index; +} + +// This is a callable that can be used to transform an iterator. The source iterator +// will contain buffers of data and the output iterator will contain delimited CSV +// blocks. 
util::optional is used so that there is an end token (required by the +// iterator APIs (e.g. Visit)) even though an empty optional is never used in this code. class BlockReader { public: - BlockReader(std::unique_ptr chunker, - Iterator> buffer_iterator, - std::shared_ptr first_buffer) + BlockReader(std::unique_ptr chunker, std::shared_ptr first_buffer) : chunker_(std::move(chunker)), - buffer_iterator_(std::move(buffer_iterator)), partial_(std::make_shared("")), buffer_(std::move(first_buffer)) {} protected: std::unique_ptr chunker_; - Iterator> buffer_iterator_; - std::shared_ptr partial_, buffer_; int64_t block_index_ = 0; // Whether there was a trailing CR at the end of last received buffer @@ -177,14 +197,25 @@ class SerialBlockReader : public BlockReader { public: using BlockReader::BlockReader; - Result> Next() { + static Iterator MakeIterator( + Iterator> buffer_iterator, std::unique_ptr chunker, + std::shared_ptr first_buffer) { + auto block_reader = + std::make_shared(std::move(chunker), first_buffer); + // Wrap shared pointer in callable + Transformer, CSVBlock> block_reader_fn = + [block_reader](std::shared_ptr buf) { + return (*block_reader)(std::move(buf)); + }; + return MakeTransformedIterator(std::move(buffer_iterator), block_reader_fn); + } + + Result> operator()(std::shared_ptr next_buffer) { if (buffer_ == nullptr) { - // EOF - return util::optional(); + return TransformFinish(); } - std::shared_ptr next_buffer, completion; - ARROW_ASSIGN_OR_RAISE(next_buffer, buffer_iterator_.Next()); + std::shared_ptr completion; bool is_final = (next_buffer == nullptr); if (is_final) { @@ -210,8 +241,9 @@ class SerialBlockReader : public BlockReader { return Status::OK(); }; - return CSVBlock{partial_, completion, buffer_, - block_index_++, is_final, std::move(consume_bytes)}; + return TransformYield(CSVBlock{partial_, completion, buffer_, + block_index_++, is_final, + std::move(consume_bytes)}); } }; @@ -220,14 +252,35 @@ class ThreadedBlockReader : public 
BlockReader { public: using BlockReader::BlockReader; - Result> Next() { + static Iterator MakeIterator( + Iterator> buffer_iterator, std::unique_ptr chunker, + std::shared_ptr first_buffer) { + auto block_reader = + std::make_shared(std::move(chunker), first_buffer); + // Wrap shared pointer in callable + Transformer, CSVBlock> block_reader_fn = + [block_reader](std::shared_ptr next) { return (*block_reader)(next); }; + return MakeTransformedIterator(std::move(buffer_iterator), block_reader_fn); + } + + static AsyncGenerator MakeAsyncIterator( + AsyncGenerator> buffer_generator, + std::unique_ptr chunker, std::shared_ptr first_buffer) { + auto block_reader = + std::make_shared(std::move(chunker), first_buffer); + // Wrap shared pointer in callable + Transformer, CSVBlock> block_reader_fn = + [block_reader](std::shared_ptr next) { return (*block_reader)(next); }; + return MakeAsyncGenerator(std::move(buffer_generator), block_reader_fn); + } + + Result> operator()(std::shared_ptr next_buffer) { if (buffer_ == nullptr) { // EOF - return util::optional(); + return TransformFinish(); } - std::shared_ptr next_buffer, whole, completion, next_partial; - ARROW_ASSIGN_OR_RAISE(next_buffer, buffer_iterator_.Next()); + std::shared_ptr whole, completion, next_partial; bool is_final = (next_buffer == nullptr); auto current_partial = std::move(partial_); @@ -252,7 +305,8 @@ class ThreadedBlockReader : public BlockReader { partial_ = std::move(next_partial); buffer_ = std::move(next_buffer); - return CSVBlock{current_partial, completion, whole, block_index_++, is_final, {}}; + return TransformYield( + CSVBlock{current_partial, completion, whole, block_index_++, is_final, {}}); } }; @@ -449,7 +503,6 @@ class ReaderMixin { ConversionSchema conversion_schema_; std::shared_ptr input_; - Iterator> buffer_iterator_; std::shared_ptr task_group_; }; @@ -462,6 +515,10 @@ class BaseTableReader : public ReaderMixin, public csv::TableReader { virtual Status Init() = 0; + Future> ReadAsync() 
override { + return Future>::MakeFinished(Read()); + } + protected: // Make column builders from conversion schema Status MakeColumnBuilders() { @@ -624,6 +681,7 @@ class BaseStreamingReader : public ReaderMixin, public csv::StreamingReader { std::vector> column_decoders_; std::shared_ptr schema_; std::shared_ptr pending_batch_; + Iterator> buffer_iterator_; bool eof_ = false; }; @@ -656,7 +714,7 @@ class SerialStreamingReader : public BaseStreamingReader { if (eof_) { return nullptr; } - if (block_reader_ == nullptr) { + if (!block_iterator_) { Status st = SetupReader(); if (!st.ok()) { // Can't setup reader => bail out @@ -670,18 +728,18 @@ class SerialStreamingReader : public BaseStreamingReader { } if (!source_eof_) { - ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_reader_->Next()); - if (maybe_block.has_value()) { - last_block_index_ = maybe_block->block_index; - auto maybe_parsed = ParseAndInsert(maybe_block->partial, maybe_block->completion, - maybe_block->buffer, maybe_block->block_index, - maybe_block->is_final); + ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_iterator_.Next()); + if (maybe_block != IterationTraits::End()) { + last_block_index_ = maybe_block.block_index; + auto maybe_parsed = ParseAndInsert(maybe_block.partial, maybe_block.completion, + maybe_block.buffer, maybe_block.block_index, + maybe_block.is_final); if (!maybe_parsed.ok()) { // Parse error => bail out eof_ = true; return maybe_parsed.status(); } - RETURN_NOT_OK(maybe_block->consume_bytes(*maybe_parsed)); + RETURN_NOT_OK(maybe_block.consume_bytes(*maybe_parsed)); } else { source_eof_ = true; for (auto& decoder : column_decoders_) { @@ -705,15 +763,15 @@ class SerialStreamingReader : public BaseStreamingReader { RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer)); RETURN_NOT_OK(MakeColumnDecoders()); - block_reader_ = std::make_shared(MakeChunker(parse_options_), - std::move(buffer_iterator_), - std::move(first_buffer)); + block_iterator_ = 
SerialBlockReader::MakeIterator(std::move(buffer_iterator_), + MakeChunker(parse_options_), + std::move(first_buffer)); return Status::OK(); } bool source_eof_ = false; int64_t last_block_index_ = 0; - std::shared_ptr block_reader_; + Iterator block_iterator_; }; ///////////////////////////////////////////////////////////////////////// @@ -746,41 +804,46 @@ class SerialTableReader : public BaseTableReader { RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer)); RETURN_NOT_OK(MakeColumnBuilders()); - SerialBlockReader block_reader(MakeChunker(parse_options_), - std::move(buffer_iterator_), std::move(first_buffer)); - + auto block_iterator = SerialBlockReader::MakeIterator(std::move(buffer_iterator_), + MakeChunker(parse_options_), + std::move(first_buffer)); while (true) { - ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_reader.Next()); - if (!maybe_block.has_value()) { + ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_iterator.Next()); + if (maybe_block == IterationTraits::End()) { // EOF break; } - ARROW_ASSIGN_OR_RAISE(int64_t parsed_bytes, - ParseAndInsert(maybe_block->partial, maybe_block->completion, - maybe_block->buffer, maybe_block->block_index, - maybe_block->is_final)); - RETURN_NOT_OK(maybe_block->consume_bytes(parsed_bytes)); + ARROW_ASSIGN_OR_RAISE( + int64_t parsed_bytes, + ParseAndInsert(maybe_block.partial, maybe_block.completion, maybe_block.buffer, + maybe_block.block_index, maybe_block.is_final)); + RETURN_NOT_OK(maybe_block.consume_bytes(parsed_bytes)); } // Finish conversion, create schema and table RETURN_NOT_OK(task_group_->Finish()); return MakeTable(); } -}; -///////////////////////////////////////////////////////////////////////// -// Parallel TableReader implementation + protected: + Iterator> buffer_iterator_; +}; -class ThreadedTableReader : public BaseTableReader { +class AsyncThreadedTableReader + : public BaseTableReader, + public std::enable_shared_from_this { public: using BaseTableReader::BaseTableReader; - 
ThreadedTableReader(MemoryPool* pool, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options, ThreadPool* thread_pool) + AsyncThreadedTableReader(MemoryPool* pool, std::shared_ptr input, + const ReadOptions& read_options, + const ParseOptions& parse_options, + const ConvertOptions& convert_options, Executor* cpu_executor, + Executor* io_executor) : BaseTableReader(pool, input, read_options, parse_options, convert_options), - thread_pool_(thread_pool) {} + cpu_executor_(cpu_executor), + io_executor_(io_executor) {} - ~ThreadedTableReader() override { + ~AsyncThreadedTableReader() override { if (task_group_) { // In case of error, make sure all pending tasks are finished before // we start destroying BaseTableReader members @@ -792,65 +855,98 @@ class ThreadedTableReader : public BaseTableReader { ARROW_ASSIGN_OR_RAISE(auto istream_it, io::MakeInputStreamIterator(input_, read_options_.block_size)); - int32_t block_queue_size = thread_pool_->GetCapacity(); - ARROW_ASSIGN_OR_RAISE(auto rh_it, - MakeReadaheadIterator(std::move(istream_it), block_queue_size)); - buffer_iterator_ = CSVBufferIterator::Make(std::move(rh_it)); + // TODO: use io_executor_ here, see ARROW-11590 + ARROW_ASSIGN_OR_RAISE(auto background_executor, internal::ThreadPool::Make(1)); + ARROW_ASSIGN_OR_RAISE(auto bg_it, MakeBackgroundGenerator(std::move(istream_it), + background_executor.get())); + AsyncGenerator> wrapped_bg_it = + [bg_it, background_executor]() { return bg_it(); }; + + auto transferred_it = + MakeTransferredGenerator(std::move(wrapped_bg_it), cpu_executor_); + + int32_t block_queue_size = cpu_executor_->GetCapacity(); + auto rh_it = MakeReadaheadGenerator(std::move(transferred_it), block_queue_size); + buffer_generator_ = CSVBufferIterator::MakeAsync(std::move(rh_it)); return Status::OK(); } - Result> Read() override { - task_group_ = internal::TaskGroup::MakeThreaded(thread_pool_); + Result> Read() override 
{ return ReadAsync().result(); } + + Future> ReadAsync() override { + task_group_ = internal::TaskGroup::MakeThreaded(cpu_executor_); + + auto self = shared_from_this(); + return ProcessFirstBuffer().Then([self](std::shared_ptr first_buffer) { + auto block_generator = ThreadedBlockReader::MakeAsyncIterator( + self->buffer_generator_, MakeChunker(self->parse_options_), + std::move(first_buffer)); + + std::function block_visitor = + [self](CSVBlock maybe_block) -> Status { + // The logic in VisitAsyncGenerator ensures that we will never be + // passed an empty block (visit does not call with the end token) so + // we can be assured maybe_block has a value. + DCHECK_GE(maybe_block.block_index, 0); + DCHECK(!maybe_block.consume_bytes); + + // Launch parse task + self->task_group_->Append([self, maybe_block] { + return self + ->ParseAndInsert(maybe_block.partial, maybe_block.completion, + maybe_block.buffer, maybe_block.block_index, + maybe_block.is_final) + .status(); + }); + return Status::OK(); + }; + + return VisitAsyncGenerator(std::move(block_generator), block_visitor) + .Then([self](...) -> Future<> { + // By this point we've added all top level tasks so it is safe to call + // FinishAsync + return self->task_group_->FinishAsync(); + }) + .Then([self](...) 
-> Result> { + // Finish conversion, create schema and table + return self->MakeTable(); + }); + }); + } + protected: + Future> ProcessFirstBuffer() { // First block - ARROW_ASSIGN_OR_RAISE(auto first_buffer, buffer_iterator_.Next()); - if (first_buffer == nullptr) { - return Status::Invalid("Empty CSV file"); - } - RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer)); - RETURN_NOT_OK(MakeColumnBuilders()); - - ThreadedBlockReader block_reader(MakeChunker(parse_options_), - std::move(buffer_iterator_), - std::move(first_buffer)); - - while (true) { - ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_reader.Next()); - if (!maybe_block.has_value()) { - // EOF - break; + auto first_buffer_future = buffer_generator_(); + return first_buffer_future.Then([this](const std::shared_ptr& first_buffer) + -> Result> { + if (first_buffer == nullptr) { + return Status::Invalid("Empty CSV file"); } - DCHECK(!maybe_block->consume_bytes); - - // Launch parse task - task_group_->Append([this, maybe_block] { - return ParseAndInsert(maybe_block->partial, maybe_block->completion, - maybe_block->buffer, maybe_block->block_index, - maybe_block->is_final) - .status(); - }); - } - - // Finish conversion, create schema and table - RETURN_NOT_OK(task_group_->Finish()); - return MakeTable(); + std::shared_ptr first_buffer_processed; + RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer_processed)); + RETURN_NOT_OK(MakeColumnBuilders()); + return first_buffer_processed; + }); } - protected: - ThreadPool* thread_pool_; + Executor* cpu_executor_; + Executor* io_executor_; + AsyncGenerator> buffer_generator_; }; ///////////////////////////////////////////////////////////////////////// // Factory functions Result> TableReader::Make( - MemoryPool* pool, std::shared_ptr input, - const ReadOptions& read_options, const ParseOptions& parse_options, - const ConvertOptions& convert_options) { + MemoryPool* pool, io::AsyncContext async_context, + std::shared_ptr input, const ReadOptions& 
read_options, + const ParseOptions& parse_options, const ConvertOptions& convert_options) { std::shared_ptr reader; if (read_options.use_threads) { - reader = std::make_shared( - pool, input, read_options, parse_options, convert_options, GetCpuThreadPool()); + reader = std::make_shared( + pool, input, read_options, parse_options, convert_options, async_context.executor, + internal::GetCpuThreadPool()); } else { reader = std::make_shared(pool, input, read_options, parse_options, convert_options); @@ -871,4 +967,5 @@ Result> StreamingReader::Make( } } // namespace csv + } // namespace arrow diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h index 652cedc8c74c5..c361fbddce97c 100644 --- a/cpp/src/arrow/csv/reader.h +++ b/cpp/src/arrow/csv/reader.h @@ -20,10 +20,12 @@ #include #include "arrow/csv/options.h" // IWYU pragma: keep +#include "arrow/io/interfaces.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/future.h" #include "arrow/util/visibility.h" namespace arrow { @@ -40,9 +42,12 @@ class ARROW_EXPORT TableReader { /// Read the entire CSV file and convert it to a Arrow Table virtual Result> Read() = 0; + /// Read the entire CSV file and convert it to a Arrow Table + virtual Future> ReadAsync() = 0; /// Create a TableReader instance static Result> Make(MemoryPool* pool, + io::AsyncContext async_context, std::shared_ptr input, const ReadOptions&, const ParseOptions&, diff --git a/cpp/src/arrow/csv/reader_test.cc b/cpp/src/arrow/csv/reader_test.cc new file mode 100644 index 0000000000000..64010ae481ac4 --- /dev/null +++ b/cpp/src/arrow/csv/reader_test.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include + +#include "arrow/csv/options.h" +#include "arrow/csv/reader.h" +#include "arrow/csv/test_common.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/future.h" +#include "arrow/util/thread_pool.h" + +namespace arrow { +namespace csv { + +using TableReaderFactory = + std::function>(std::shared_ptr)>; + +void StressTableReader(TableReaderFactory reader_factory) { + const int NTASKS = 100; + const int NROWS = 1000; + ASSERT_OK_AND_ASSIGN(auto table_buffer, MakeSampleCsvBuffer(NROWS)); + + std::vector>> task_futures(NTASKS); + for (int i = 0; i < NTASKS; i++) { + auto input = std::make_shared(table_buffer); + ASSERT_OK_AND_ASSIGN(auto reader, reader_factory(input)); + task_futures[i] = reader->ReadAsync(); + } + auto combined_future = All(task_futures); + combined_future.Wait(); + + ASSERT_OK_AND_ASSIGN(std::vector>> results, + combined_future.result()); + for (auto&& result : results) { + ASSERT_OK_AND_ASSIGN(auto table, result); + ASSERT_EQ(NROWS, table->num_rows()); + } +} + +void StressInvalidTableReader(TableReaderFactory reader_factory) { + const int NTASKS = 100; + const int NROWS = 1000; + ASSERT_OK_AND_ASSIGN(auto table_buffer, MakeSampleCsvBuffer(NROWS, false)); + + std::vector>> 
task_futures(NTASKS); + for (int i = 0; i < NTASKS; i++) { + auto input = std::make_shared(table_buffer); + ASSERT_OK_AND_ASSIGN(auto reader, reader_factory(input)); + task_futures[i] = reader->ReadAsync(); + } + auto combined_future = All(task_futures); + combined_future.Wait(); + + ASSERT_OK_AND_ASSIGN(std::vector>> results, + combined_future.result()); + for (auto&& result : results) { + ASSERT_RAISES(Invalid, result); + } +} + +void TestNestedParallelism(std::shared_ptr thread_pool, + TableReaderFactory reader_factory) { + const int NROWS = 1000; + ASSERT_OK_AND_ASSIGN(auto table_buffer, MakeSampleCsvBuffer(NROWS)); + auto input = std::make_shared(table_buffer); + ASSERT_OK_AND_ASSIGN(auto reader, reader_factory(input)); + + Future> table_future; + + auto read_task = [&reader, &table_future]() mutable { + table_future = reader->ReadAsync(); + return Status::OK(); + }; + ASSERT_OK_AND_ASSIGN(auto future, thread_pool->Submit(read_task)); + + ASSERT_FINISHES_OK(future); + ASSERT_FINISHES_OK_AND_ASSIGN(auto table, table_future); + ASSERT_EQ(table->num_rows(), NROWS); +} // namespace csv + +TableReaderFactory MakeSerialFactory() { + return [](std::shared_ptr input_stream) { + auto read_options = ReadOptions::Defaults(); + read_options.block_size = 1 << 10; + read_options.use_threads = false; + return TableReader::Make(default_memory_pool(), io::AsyncContext(), input_stream, + read_options, ParseOptions::Defaults(), + ConvertOptions::Defaults()); + }; +} + +TEST(SerialReaderTests, Stress) { StressTableReader(MakeSerialFactory()); } +TEST(SerialReaderTests, StressInvalid) { StressInvalidTableReader(MakeSerialFactory()); } +TEST(SerialReaderTests, NestedParallelism) { + ASSERT_OK_AND_ASSIGN(auto thread_pool, internal::ThreadPool::Make(1)); + TestNestedParallelism(thread_pool, MakeSerialFactory()); +} + +Result MakeAsyncFactory( + std::shared_ptr thread_pool = nullptr) { + if (!thread_pool) { + ARROW_ASSIGN_OR_RAISE(thread_pool, internal::ThreadPool::Make(1)); + } + 
return [thread_pool](std::shared_ptr input_stream) + -> Result> { + ReadOptions read_options = ReadOptions::Defaults(); + read_options.use_threads = true; + read_options.block_size = 1 << 10; + auto table_reader = TableReader::Make( + default_memory_pool(), io::AsyncContext(thread_pool.get()), input_stream, + read_options, ParseOptions::Defaults(), ConvertOptions::Defaults()); + return table_reader; + }; +} + +TEST(AsyncReaderTests, Stress) { + ASSERT_OK_AND_ASSIGN(auto table_factory, MakeAsyncFactory()); + StressTableReader(table_factory); +} +TEST(AsyncReaderTests, StressInvalid) { + ASSERT_OK_AND_ASSIGN(auto table_factory, MakeAsyncFactory()); + StressInvalidTableReader(table_factory); +} +TEST(AsyncReaderTests, NestedParallelism) { + ASSERT_OK_AND_ASSIGN(auto thread_pool, internal::ThreadPool::Make(1)); + ASSERT_OK_AND_ASSIGN(auto table_factory, MakeAsyncFactory(thread_pool)); + TestNestedParallelism(thread_pool, table_factory); +} + +} // namespace csv +} // namespace arrow diff --git a/cpp/src/arrow/csv/test_common.cc b/cpp/src/arrow/csv/test_common.cc index 08981a705010d..c3d0241aa3861 100644 --- a/cpp/src/arrow/csv/test_common.cc +++ b/cpp/src/arrow/csv/test_common.cc @@ -61,5 +61,59 @@ void MakeColumnParser(std::vector items, std::shared_ptrnum_rows(), items.size()); } +namespace { + +const std::vector int64_rows = {"123", "4", "-317005557", "", "N/A", "0"}; +const std::vector float_rows = {"0", "123.456", "-3170.55766", "", "N/A"}; +const std::vector decimal128_rows = {"0", "123.456", "-3170.55766", + "", "N/A", "1233456789.123456789"}; +const std::vector iso8601_rows = {"1917-10-17", "2018-09-13", + "1941-06-22 04:00", "1945-05-09 09:45:38"}; +const std::vector strptime_rows = {"10/17/1917", "9/13/2018", "9/5/1945"}; + +static void WriteHeader(std::ostream& writer) { + writer << "Int64,Float,Decimal128,ISO8601,Strptime" << std::endl; +} + +static std::string GetCell(const std::vector& base_rows, size_t row_index) { + return base_rows[row_index % 
base_rows.size()]; +} + +static void WriteRow(std::ostream& writer, size_t row_index) { + writer << GetCell(int64_rows, row_index); + writer << ','; + writer << GetCell(float_rows, row_index); + writer << ','; + writer << GetCell(decimal128_rows, row_index); + writer << ','; + writer << GetCell(iso8601_rows, row_index); + writer << ','; + writer << GetCell(strptime_rows, row_index); + writer << std::endl; +} + +static void WriteInvalidRow(std::ostream& writer, size_t row_index) { + writer << "\"" << std::endl << "\""; + writer << std::endl; +} +} // namespace + +Result> MakeSampleCsvBuffer(size_t num_rows, bool valid) { + std::stringstream writer; + + WriteHeader(writer); + for (size_t i = 0; i < num_rows; ++i) { + if (i == num_rows / 2 && !valid) { + WriteInvalidRow(writer, i); + } else { + WriteRow(writer, i); + } + } + + auto table_str = writer.str(); + auto table_buffer = std::make_shared(table_str); + return MemoryManager::CopyBuffer(table_buffer, default_cpu_memory_manager()); +} + } // namespace csv } // namespace arrow diff --git a/cpp/src/arrow/csv/test_common.h b/cpp/src/arrow/csv/test_common.h index 119da03a83df9..823cf643fa022 100644 --- a/cpp/src/arrow/csv/test_common.h +++ b/cpp/src/arrow/csv/test_common.h @@ -46,5 +46,8 @@ void MakeCSVParser(std::vector lines, std::shared_ptr* ARROW_TESTING_EXPORT void MakeColumnParser(std::vector items, std::shared_ptr* out); +ARROW_TESTING_EXPORT +Result> MakeSampleCsvBuffer(size_t num_rows, bool valid = true); + } // namespace csv } // namespace arrow diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index ae62283b1d7a6..2f0110255ec42 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -160,7 +160,7 @@ TEST(Expression, ToString) { "\"617A\""); auto ts = *MakeScalar("1990-10-23 10:23:33")->CastTo(timestamp(TimeUnit::NANO)); - EXPECT_EQ(literal(ts).ToString(), "656677413000000000"); + EXPECT_EQ(literal(ts).ToString(), 
"1990-10-23 10:23:33.000000000"); EXPECT_EQ(call("add", {literal(3), field_ref("beta")}).ToString(), "add(3, beta)"); diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0d49cd72135ba..c26ad0490bad6 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -18,6 +18,7 @@ #include "arrow/dataset/file_parquet.h" #include +#include #include #include #include @@ -54,12 +55,20 @@ class ParquetScanTask : public ScanTask { public: ParquetScanTask(int row_group, std::vector column_projection, std::shared_ptr reader, + std::shared_ptr pre_buffer_once, + std::vector pre_buffer_row_groups, + arrow::io::AsyncContext async_context, + arrow::io::CacheOptions cache_options, std::shared_ptr options, std::shared_ptr context) : ScanTask(std::move(options), std::move(context)), row_group_(row_group), column_projection_(std::move(column_projection)), - reader_(std::move(reader)) {} + reader_(std::move(reader)), + pre_buffer_once_(std::move(pre_buffer_once)), + pre_buffer_row_groups_(std::move(pre_buffer_row_groups)), + async_context_(async_context), + cache_options_(cache_options) {} Result Execute() override { // The construction of parquet's RecordBatchReader is deferred here to @@ -79,16 +88,41 @@ class ParquetScanTask : public ScanTask { std::unique_ptr record_batch_reader; } NextBatch; + RETURN_NOT_OK(EnsurePreBuffered()); NextBatch.file_reader = reader_; RETURN_NOT_OK(reader_->GetRecordBatchReader({row_group_}, column_projection_, &NextBatch.record_batch_reader)); return MakeFunctionIterator(std::move(NextBatch)); } + // Ensure that pre-buffering has been applied to the underlying Parquet reader + // exactly once (if needed). If we instead set pre_buffer on in the Arrow + // reader properties, each scan task will try to separately pre-buffer, which + // will lead to crashes as they trample the Parquet file reader's internal + // state. Instead, pre-buffer once at the file level. 
This also has the + // advantage that we can coalesce reads across row groups. + Status EnsurePreBuffered() { + if (pre_buffer_once_) { + BEGIN_PARQUET_CATCH_EXCEPTIONS + std::call_once(*pre_buffer_once_, [this]() { + reader_->parquet_reader()->PreBuffer(pre_buffer_row_groups_, column_projection_, + async_context_, cache_options_); + }); + END_PARQUET_CATCH_EXCEPTIONS + } + return Status::OK(); + } + private: int row_group_; std::vector column_projection_; std::shared_ptr reader_; + // Pre-buffering state. pre_buffer_once will be nullptr if no pre-buffering is + // to be done. We assume all scan tasks have the same column projection. + std::shared_ptr pre_buffer_once_; + std::vector pre_buffer_row_groups_; + arrow::io::AsyncContext async_context_; + arrow::io::CacheOptions cache_options_; }; static parquet::ReaderProperties MakeReaderProperties( @@ -320,9 +354,15 @@ Result ParquetFileFormat::ScanFile(std::shared_ptr pre_buffer_once = nullptr; + if (reader_options.pre_buffer) { + pre_buffer_once = std::make_shared(); + } + for (size_t i = 0; i < row_groups.size(); ++i) { - tasks[i] = std::make_shared(row_groups[i], column_projection, reader, - options, context); + tasks[i] = std::make_shared( + row_groups[i], column_projection, reader, pre_buffer_once, row_groups, + reader_options.async_context, reader_options.cache_options, options, context); } return MakeVectorIterator(std::move(tasks)); diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index ae0337994a083..6967ab30669f3 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -29,6 +29,7 @@ #include "arrow/dataset/file_base.h" #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" +#include "arrow/io/caching.h" #include "arrow/util/optional.h" namespace parquet { @@ -94,6 +95,9 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { /// /// @{ std::unordered_set dict_columns; + bool pre_buffer = false; + 
arrow::io::CacheOptions cache_options = arrow::io::CacheOptions::Defaults(); + arrow::io::AsyncContext async_context; /// @} /// EXPERIMENTAL: Parallelize conversion across columns. This option is ignored if a diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 3853921665071..e198d18a8a719 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -263,6 +263,34 @@ TEST_F(TestParquetFileFormat, ScanRecordBatchReaderDictEncoded) { ASSERT_EQ(row_count, kNumRows); } +TEST_F(TestParquetFileFormat, ScanRecordBatchReaderPreBuffer) { + auto reader = GetRecordBatchReader(); + auto source = GetFileSource(reader.get()); + + opts_ = ScanOptions::Make(reader->schema()); + SetFilter(literal(true)); + + format_->reader_options.pre_buffer = true; + ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source)); + ASSERT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_, ctx_)); + + int64_t task_count = 0; + int64_t row_count = 0; + + for (auto maybe_task : scan_task_it) { + ASSERT_OK_AND_ASSIGN(auto task, maybe_task); + task_count += 1; + ASSERT_OK_AND_ASSIGN(auto rb_it, task->Execute()); + for (auto maybe_batch : rb_it) { + ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); + row_count += batch->num_rows(); + } + } + + ASSERT_EQ(task_count, kBatchRepetitions); + ASSERT_EQ(row_count, kNumRows); +} + TEST_F(TestParquetFileFormat, OpenFailureWithRelevantError) { std::shared_ptr buf = std::make_shared(util::string_view("")); auto result = format_->Inspect(FileSource(buf)); diff --git a/cpp/src/arrow/flight/test_integration_client.cc b/cpp/src/arrow/flight/test_integration_client.cc index 8f331f926ef26..64da66564bc6e 100644 --- a/cpp/src/arrow/flight/test_integration_client.cc +++ b/cpp/src/arrow/flight/test_integration_client.cc @@ -203,6 +203,8 @@ class IntegrationTestScenario : public flight::Scenario { } // namespace arrow int main(int argc, char** argv) { + 
arrow::util::ArrowLog::InstallFailureSignalHandler(); + gflags::SetUsageMessage("Integration testing client for Flight."); gflags::ParseCommandLineFlags(&argc, &argv, true); std::shared_ptr scenario; @@ -222,5 +224,7 @@ int main(int argc, char** argv) { ABORT_NOT_OK(arrow::flight::Location::ForGrpcTcp(FLAGS_host, FLAGS_port, &location)); ABORT_NOT_OK(arrow::flight::FlightClient::Connect(location, options, &client)); ABORT_NOT_OK(scenario->RunClient(std::move(client))); + + arrow::util::ArrowLog::UninstallSignalAction(); return 0; } diff --git a/cpp/src/arrow/io/hdfs_internal.cc b/cpp/src/arrow/io/hdfs_internal.cc index ced298f732130..6d9f71cc1796a 100644 --- a/cpp/src/arrow/io/hdfs_internal.cc +++ b/cpp/src/arrow/io/hdfs_internal.cc @@ -173,7 +173,7 @@ Result> get_potential_libjvm_paths() { std::string file_name; // From heuristics -#ifdef __WIN32 +#ifdef _WIN32 ARROW_ASSIGN_OR_RAISE(search_prefixes, MakeFilenameVector({""})); ARROW_ASSIGN_OR_RAISE(search_suffixes, MakeFilenameVector({"/jre/bin/server", "/bin/server"})); diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 6569e71b454f8..906cb00ef0775 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -32,6 +32,7 @@ #include "arrow/ipc/options.h" #include "arrow/ipc/util.h" #include "arrow/status.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index f818aebab24d3..6a1f5ffe0ebc4 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1335,7 +1335,11 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr metadata; RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata)); - *out = ::arrow::schema(std::move(fields), metadata); + // set endianess using the value in flatbuf schema + auto endianness = 
schema->endianness() == flatbuf::Endianness::Little + ? Endianness::Little + : Endianness::Big; + *out = ::arrow::schema(std::move(fields), endianness, metadata); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/options.h b/cpp/src/arrow/ipc/options.h index aa939e24378f5..2e0f800b5ad51 100644 --- a/cpp/src/arrow/ipc/options.h +++ b/cpp/src/arrow/ipc/options.h @@ -137,6 +137,18 @@ struct ARROW_EXPORT IpcReadOptions { /// like decompression bool use_threads = true; + /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness + /// + /// If the endianness of the received schema is not equal to platform-native + /// endianness, then all buffers with endian-sensitive data will be byte-swapped. + /// This includes the value buffers of numeric types, temporal types, decimal + /// types, as well as the offset buffers of variable-sized binary and list-like + /// types. + /// + /// Endianness conversion is achieved by the RecordBatchFileReader, + /// RecordBatchStreamReader and StreamDecoder classes. 
+ bool ensure_native_endian = true; + static IpcReadOptions Defaults(); }; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 82fb4c743a435..7e39ee1c484bc 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -46,6 +46,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" +#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" @@ -108,6 +109,30 @@ Status InvalidMessageType(MessageType expected, MessageType actual) { // ---------------------------------------------------------------------- // Record batch read path +/// \brief Structure to keep common arguments to be passed +struct IpcReadContext { + IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap, + MetadataVersion version = MetadataVersion::V5, + Compression::type kind = Compression::UNCOMPRESSED) + : dictionary_memo(memo), + options(option), + metadata_version(version), + compression(kind), + swap_endian(swap) {} + + DictionaryMemo* dictionary_memo; + + const IpcReadOptions& options; + + MetadataVersion metadata_version; + + Compression::type compression; + + /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements + /// if this flag is true + const bool swap_endian; +}; + /// The field_index and buffer_index are incremented based on how much of the /// batch is "consumed" (through nested data reconstruction, for example) class ArrayLoader { @@ -439,10 +464,9 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op Result> LoadRecordBatchSubset( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, - const std::vector* inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, MetadataVersion metadata_version, - Compression::type compression, io::RandomAccessFile* file) { - ArrayLoader 
loader(metadata, metadata_version, options, file); + const std::vector* inclusion_mask, const IpcReadContext& context, + io::RandomAccessFile* file) { + ArrayLoader loader(metadata, context.metadata_version, context.options, file); ArrayDataVector columns(schema->num_fields()); ArrayDataVector filtered_columns; @@ -472,7 +496,8 @@ Result> LoadRecordBatchSubset( // Dictionary resolution needs to happen on the unfiltered columns, // because fields are mapped structurally (by path in the original schema). - RETURN_NOT_OK(ResolveDictionaries(columns, *dictionary_memo, options.memory_pool)); + RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo, + context.options.memory_pool)); if (inclusion_mask) { filtered_schema = ::arrow::schema(std::move(filtered_fields), schema->metadata()); @@ -481,25 +506,30 @@ Result> LoadRecordBatchSubset( filtered_schema = schema; filtered_columns = std::move(columns); } - if (compression != Compression::UNCOMPRESSED) { - RETURN_NOT_OK(DecompressBuffers(compression, options, &filtered_columns)); + if (context.compression != Compression::UNCOMPRESSED) { + RETURN_NOT_OK( + DecompressBuffers(context.compression, context.options, &filtered_columns)); } + // swap endian in a set of ArrayData if necessary (swap_endian == true) + if (context.swap_endian) { + for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { + ARROW_ASSIGN_OR_RAISE(filtered_columns[i], + arrow::internal::SwapEndianArrayData(filtered_columns[i])); + } + } return RecordBatch::Make(filtered_schema, metadata->length(), std::move(filtered_columns)); } Result> LoadRecordBatch( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, - const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, MetadataVersion metadata_version, - Compression::type compression, io::RandomAccessFile* file) { + const std::vector& inclusion_mask, const IpcReadContext& context, + io::RandomAccessFile* file) { if 
(inclusion_mask.size() > 0) { - return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, dictionary_memo, - options, metadata_version, compression, file); + return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file); } else { - return LoadRecordBatchSubset(metadata, schema, nullptr, dictionary_memo, options, - metadata_version, compression, file); + return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context, file); } } @@ -577,8 +607,8 @@ Result> ReadRecordBatch( Result> ReadRecordBatchInternal( const Buffer& metadata, const std::shared_ptr& schema, - const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, io::RandomAccessFile* file) { + const std::vector& inclusion_mask, IpcReadContext& context, + io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); auto batch = message->header_as_RecordBatch(); @@ -589,15 +619,15 @@ Result> ReadRecordBatchInternal( Compression::type compression; RETURN_NOT_OK(GetCompression(batch, &compression)); - if (compression == Compression::UNCOMPRESSED && + if (context.compression == Compression::UNCOMPRESSED && message->version() == flatbuf::MetadataVersion::V4) { // Possibly obtain codec information from experimental serialization format // in 0.17.x RETURN_NOT_OK(GetCompressionExperimental(message, &compression)); } - return LoadRecordBatch(batch, schema, inclusion_mask, dictionary_memo, options, - internal::GetMetadataVersion(message->version()), compression, - file); + context.compression = compression; + context.metadata_version = internal::GetMetadataVersion(message->version()); + return LoadRecordBatch(batch, schema, inclusion_mask, context, file); } // If we are selecting only certain fields, populate an inclusion mask for fast lookups. 
@@ -630,7 +660,8 @@ Status GetInclusionMaskAndOutSchema(const std::shared_ptr& full_schema, included_fields.push_back(full_schema->field(i)); } - *out_schema = schema(std::move(included_fields), full_schema->metadata()); + *out_schema = schema(std::move(included_fields), full_schema->endianness(), + full_schema->metadata()); return Status::OK(); } @@ -638,25 +669,32 @@ Status UnpackSchemaMessage(const void* opaque_schema, const IpcReadOptions& opti DictionaryMemo* dictionary_memo, std::shared_ptr* schema, std::shared_ptr* out_schema, - std::vector* field_inclusion_mask) { + std::vector* field_inclusion_mask, bool* swap_endian) { RETURN_NOT_OK(internal::GetSchema(opaque_schema, dictionary_memo, schema)); // If we are selecting only certain fields, populate the inclusion mask now // for fast lookups - return GetInclusionMaskAndOutSchema(*schema, options.included_fields, - field_inclusion_mask, out_schema); + RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields, + field_inclusion_mask, out_schema)); + *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian(); + if (*swap_endian) { + // create a new schema with native endianness before swapping endian in ArrayData + *schema = schema->get()->WithEndianness(Endianness::Native); + *out_schema = out_schema->get()->WithEndianness(Endianness::Native); + } + return Status::OK(); } Status UnpackSchemaMessage(const Message& message, const IpcReadOptions& options, DictionaryMemo* dictionary_memo, std::shared_ptr* schema, std::shared_ptr* out_schema, - std::vector* field_inclusion_mask) { + std::vector* field_inclusion_mask, bool* swap_endian) { CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message.type()); CHECK_HAS_NO_BODY(message); return UnpackSchemaMessage(message.header(), options, dictionary_memo, schema, - out_schema, field_inclusion_mask); + out_schema, field_inclusion_mask, swap_endian); } Result> ReadRecordBatch( @@ -666,15 +704,14 @@ Result> ReadRecordBatch( std::shared_ptr 
out_schema; // Empty means do not use std::vector inclusion_mask; - RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, options.included_fields, + IpcReadContext context(const_cast(dictionary_memo), options, false); + RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields, &inclusion_mask, &out_schema)); - return ReadRecordBatchInternal(metadata, schema, inclusion_mask, dictionary_memo, - options, file); + return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file); } -Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, DictionaryKind* kind, - io::RandomAccessFile* file) { +Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, + DictionaryKind* kind, io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); const auto dictionary_batch = message->header_as_DictionaryBatch(); @@ -701,42 +738,46 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, // Look up the dictionary value type, which must have been added to the // DictionaryMemo already prior to invoking this function - ARROW_ASSIGN_OR_RAISE(auto value_type, dictionary_memo->GetDictionaryType(id)); + ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id)); // Load the dictionary data from the dictionary batch ArrayLoader loader(batch_meta, internal::GetMetadataVersion(message->version()), - options, file); - const auto dict_data = std::make_shared(); + context.options, file); + auto dict_data = std::make_shared(); const Field dummy_field("", value_type); RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get())); if (compression != Compression::UNCOMPRESSED) { ArrayDataVector dict_fields{dict_data}; - RETURN_NOT_OK(DecompressBuffers(compression, options, &dict_fields)); + RETURN_NOT_OK(DecompressBuffers(compression, 
context.options, &dict_fields)); + } + + // swap endian in dict_data if necessary (swap_endian == true) + if (context.swap_endian) { + ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data)); } if (dictionary_batch->isDelta()) { if (kind != nullptr) { *kind = DictionaryKind::Delta; } - return dictionary_memo->AddDictionaryDelta(id, dict_data); + return context.dictionary_memo->AddDictionaryDelta(id, dict_data); } ARROW_ASSIGN_OR_RAISE(bool inserted, - dictionary_memo->AddOrReplaceDictionary(id, dict_data)); + context.dictionary_memo->AddOrReplaceDictionary(id, dict_data)); if (kind != nullptr) { *kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement; } return Status::OK(); } -Status ReadDictionary(const Message& message, DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, DictionaryKind* kind) { +Status ReadDictionary(const Message& message, const IpcReadContext& context, + DictionaryKind* kind) { // Only invoke this method if we already know we have a dictionary message DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH); CHECK_HAS_BODY(message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadDictionary(*message.metadata(), dictionary_memo, options, kind, - reader.get()); + return ReadDictionary(*message.metadata(), context, kind, reader.get()); } // ---------------------------------------------------------------------- @@ -755,8 +796,10 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { return Status::Invalid("Tried reading schema message, was null or length 0"); } - return UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, - &out_schema_, &field_inclusion_mask_); + RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, + &out_schema_, &field_inclusion_mask_, + &swap_endian_)); + return Status::OK(); } Status ReadNext(std::shared_ptr* batch) override { @@ -788,8 +831,9 @@ class 
RecordBatchStreamReaderImpl : public RecordBatchStreamReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get()) + context, reader.get()) .Value(batch); } @@ -819,8 +863,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { // Read dictionary from dictionary batch Status ReadDictionary(const Message& message) { DictionaryKind kind; - RETURN_NOT_OK( - ::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, &kind)); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind)); switch (kind) { case DictionaryKind::New: break; @@ -886,6 +930,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { DictionaryMemo dictionary_memo_; std::shared_ptr schema_, out_schema_; + + bool swap_endian_; }; // ---------------------------------------------------------------------- @@ -941,10 +987,10 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - ARROW_ASSIGN_OR_RAISE( - auto batch, - ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get())); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal( + *message->metadata(), schema_, + field_inclusion_mask_, context, reader.get())); ++stats_.num_record_batches; return batch; } @@ -964,7 +1010,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Get the schema and record any observed dictionaries RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_, - &schema_, 
&out_schema_, &field_inclusion_mask_)); + &schema_, &out_schema_, &field_inclusion_mask_, + &swap_endian_)); ++stats_.num_messages; return Status::OK(); } @@ -1008,8 +1055,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); DictionaryKind kind; - RETURN_NOT_OK(ReadDictionary(*message->metadata(), &dictionary_memo_, options_, - &kind, reader.get())); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get())); ++stats_.num_dictionary_batches; if (kind != DictionaryKind::New) { return Status::Invalid( @@ -1097,6 +1144,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { std::shared_ptr out_schema_; ReadStats stats_; + + bool swap_endian_; }; Result> RecordBatchFileReader::Open( @@ -1192,7 +1241,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { private: Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_, - &out_schema_, &field_inclusion_mask_)); + &out_schema_, &field_inclusion_mask_, + &swap_endian_)); n_required_dictionaries_ = dictionary_memo_.fields().num_fields(); if (n_required_dictionaries_ == 0) { @@ -1220,15 +1270,17 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { } Status OnRecordBatchMessageDecoded(std::unique_ptr message) { + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); if (message->type() == MessageType::DICTIONARY_BATCH) { return ReadDictionary(*message); } else { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); ARROW_ASSIGN_OR_RAISE( auto batch, ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, 
reader.get())); + context, reader.get())); ++stats_.num_record_batches; return listener_->OnRecordBatchDecoded(std::move(batch)); } @@ -1237,8 +1289,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { // Read dictionary from dictionary batch Status ReadDictionary(const Message& message) { DictionaryKind kind; - RETURN_NOT_OK( - ::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, &kind)); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind)); ++stats_.num_dictionary_batches; switch (kind) { case DictionaryKind::New: @@ -1262,6 +1314,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { DictionaryMemo dictionary_memo_; std::shared_ptr schema_, out_schema_; ReadStats stats_; + bool swap_endian_; }; StreamDecoder::StreamDecoder(std::shared_ptr listener, IpcReadOptions options) { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index a0f61ba9d94de..ab10238850c97 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -1045,9 +1045,9 @@ Status MakeRandomTensor(const std::shared_ptr& type, const auto& element_type = internal::checked_cast(*type); std::vector strides; if (row_major_p) { - internal::ComputeRowMajorStrides(element_type, shape, &strides); + RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); } else { - internal::ComputeColumnMajorStrides(element_type, shape, &strides); + RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape, &strides)); } const int64_t element_size = element_type.bit_width() / CHAR_BIT; diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index ac866daa8d2bf..c14ff5ec9bc1e 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -49,6 +49,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" 
+#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" diff --git a/cpp/src/arrow/json/reader.cc b/cpp/src/arrow/json/reader.cc index dc0d6e04d11aa..44aa2607d9ee7 100644 --- a/cpp/src/arrow/json/reader.cc +++ b/cpp/src/arrow/json/reader.cc @@ -29,6 +29,7 @@ #include "arrow/json/parser.h" #include "arrow/record_batch.h" #include "arrow/table.h" +#include "arrow/util/async_generator.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/public_api_test.cc b/cpp/src/arrow/public_api_test.cc index 875d07d81527b..25e43d3b9b317 100644 --- a/cpp/src/arrow/public_api_test.cc +++ b/cpp/src/arrow/public_api_test.cc @@ -30,10 +30,6 @@ #error "ASSIGN_OR_RAISE should not be visible from Arrow public headers." #endif -#ifdef ARROW_UTIL_PARALLEL_H -#error "arrow/util/parallel.h is an internal header" -#endif - #include #include diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 6504d950674a0..0172a85243470 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -317,7 +317,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable> { return ValueUnsafe(); } const T& operator*() const& { return ValueOrDie(); } - const T* operator->() const& { return &ValueOrDie(); } + const T* operator->() const { return &ValueOrDie(); } /// Gets a mutable reference to the stored `T` value. /// @@ -332,7 +332,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable> { return ValueUnsafe(); } T& operator*() & { return ValueOrDie(); } - T* operator->() & { return &ValueOrDie(); } + T* operator->() { return &ValueOrDie(); } /// Moves and returns the internally-stored `T` value. 
/// diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 06fc6783ff35e..ee4d0ecad8fea 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -516,20 +516,6 @@ Status CastImpl(const DateScalar& from, TimestampScalar* to) { .Value(&to->value); } -// timestamp to string -Status CastImpl(const TimestampScalar& from, StringScalar* to) { - to->value = FormatToBuffer(internal::StringFormatter{}, from); - return Status::OK(); -} - -// date to string -template -Status CastImpl(const DateScalar& from, StringScalar* to) { - TimestampScalar ts({}, timestamp(TimeUnit::MILLI)); - RETURN_NOT_OK(CastImpl(from, &ts)); - return CastImpl(ts, to); -} - // string to any template Status CastImpl(const StringScalar& from, ScalarType* to) { @@ -556,6 +542,18 @@ Status CastImpl(const ScalarType& from, StringScalar* to) { return Status::OK(); } +Status CastImpl(const Decimal128Scalar& from, StringScalar* to) { + auto from_type = checked_cast(from.type.get()); + to->value = Buffer::FromString(from.value.ToString(from_type->scale())); + return Status::OK(); +} + +Status CastImpl(const Decimal256Scalar& from, StringScalar* to) { + auto from_type = checked_cast(from.type.get()); + to->value = Buffer::FromString(from.value.ToString(from_type->scale())); + return Status::OK(); +} + struct CastImplVisitor { Status NotImplemented() { return Status::NotImplemented("cast to ", *to_type_, " from ", *from_.type); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 30a39e6e4c031..16c2f92d13b30 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -660,7 +660,7 @@ TEST(TestTimestampScalars, Cast) { ASSERT_OK_AND_ASSIGN(auto str, TimestampScalar(1024, timestamp(TimeUnit::MILLI)).CastTo(utf8())); - EXPECT_EQ(*str, StringScalar("1024")); + EXPECT_EQ(*str, StringScalar("1970-01-01 00:00:01.024")); ASSERT_OK_AND_ASSIGN(auto i64, TimestampScalar(1024, timestamp(TimeUnit::MILLI)).CastTo(int64())); EXPECT_EQ(*i64, 
Int64Scalar(1024)); diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 894a94c40cfa9..d591bacff02da 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -31,6 +31,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -40,40 +41,68 @@ using internal::checked_cast; namespace internal { -void ComputeRowMajorStrides(const FixedWidthType& type, const std::vector& shape, - std::vector* strides) { +Status ComputeRowMajorStrides(const FixedWidthType& type, + const std::vector& shape, + std::vector* strides) { const int byte_width = GetByteWidth(type); - int64_t remaining = byte_width; - for (int64_t dimsize : shape) { - remaining *= dimsize; + const size_t ndim = shape.size(); + + int64_t remaining = 0; + if (!shape.empty() && shape.front() > 0) { + remaining = byte_width; + for (size_t i = 1; i < ndim; ++i) { + if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) { + return Status::Invalid( + "Row-major strides computed from shape would not fit in 64-bit integer"); + } + } } if (remaining == 0) { strides->assign(shape.size(), byte_width); - return; + return Status::OK(); } - for (int64_t dimsize : shape) { - remaining /= dimsize; + strides->push_back(remaining); + for (size_t i = 1; i < ndim; ++i) { + remaining /= shape[i]; strides->push_back(remaining); } + + return Status::OK(); } -void ComputeColumnMajorStrides(const FixedWidthType& type, - const std::vector& shape, - std::vector* strides) { +Status ComputeColumnMajorStrides(const FixedWidthType& type, + const std::vector& shape, + std::vector* strides) { const int byte_width = internal::GetByteWidth(type); - int64_t total = byte_width; - for (int64_t dimsize : shape) { - if (dimsize == 0) { - strides->assign(shape.size(), byte_width); - return; + const size_t ndim = shape.size(); + + int64_t total = 0; + if 
(!shape.empty() && shape.back() > 0) { + total = byte_width; + for (size_t i = 0; i < ndim - 1; ++i) { + if (internal::MultiplyWithOverflow(total, shape[i], &total)) { + return Status::Invalid( + "Column-major strides computed from shape would not fit in 64-bit " + "integer"); + } } } - for (int64_t dimsize : shape) { + + if (total == 0) { + strides->assign(shape.size(), byte_width); + return Status::OK(); + } + + total = byte_width; + for (size_t i = 0; i < ndim - 1; ++i) { strides->push_back(total); - total *= dimsize; + total *= shape[i]; } + strides->push_back(total); + + return Status::OK(); } } // namespace internal @@ -85,8 +114,11 @@ inline bool IsTensorStridesRowMajor(const std::shared_ptr& type, const std::vector& strides) { std::vector c_strides; const auto& fw_type = checked_cast(*type); - internal::ComputeRowMajorStrides(fw_type, shape, &c_strides); - return strides == c_strides; + if (internal::ComputeRowMajorStrides(fw_type, shape, &c_strides).ok()) { + return strides == c_strides; + } else { + return false; + } } inline bool IsTensorStridesColumnMajor(const std::shared_ptr& type, @@ -94,8 +126,11 @@ inline bool IsTensorStridesColumnMajor(const std::shared_ptr& type, const std::vector& strides) { std::vector f_strides; const auto& fw_type = checked_cast(*type); - internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides); - return strides == f_strides; + if (internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides).ok()) { + return strides == f_strides; + } else { + return false; + } } inline Status CheckTensorValidity(const std::shared_ptr& type, @@ -127,14 +162,29 @@ Status CheckTensorStridesValidity(const std::shared_ptr& data, return Status::OK(); } - std::vector last_index(shape); - const int64_t n = static_cast(shape.size()); - for (int64_t i = 0; i < n; ++i) { - --last_index[i]; + // Check the largest offset can be computed without overflow + const size_t ndim = shape.size(); + int64_t largest_offset = 0; + for (size_t i = 0; i < 
ndim; ++i) { + if (shape[i] == 0) continue; + if (strides[i] < 0) { + // TODO(mrkn): Support negative strides for sharing views + return Status::Invalid("negative strides not supported"); + } + + int64_t dim_offset; + if (!internal::MultiplyWithOverflow(shape[i] - 1, strides[i], &dim_offset)) { + if (!internal::AddWithOverflow(largest_offset, dim_offset, &largest_offset)) { + continue; + } + } + + return Status::Invalid( + "offsets computed from shape and strides would not fit in 64-bit integer"); } - int64_t last_offset = Tensor::CalculateValueOffset(strides, last_index); + const int byte_width = internal::GetByteWidth(*type); - if (last_offset + byte_width > data->size()) { + if (largest_offset > data->size() - byte_width) { return Status::Invalid("strides must not involve buffer over run"); } return Status::OK(); @@ -159,6 +209,10 @@ Status ValidateTensorParameters(const std::shared_ptr& type, RETURN_NOT_OK(CheckTensorValidity(type, data, shape)); if (!strides.empty()) { RETURN_NOT_OK(CheckTensorStridesValidity(data, shape, strides, type)); + } else { + std::vector tmp_strides; + RETURN_NOT_OK(ComputeRowMajorStrides(checked_cast(*type), + shape, &tmp_strides)); } if (dim_names.size() > shape.size()) { return Status::Invalid("too many dim_names are supplied"); @@ -175,8 +229,8 @@ Tensor::Tensor(const std::shared_ptr& type, const std::shared_ptrid())); if (shape.size() > 0 && strides.size() == 0) { - internal::ComputeRowMajorStrides(checked_cast(*type_), shape, - &strides_); + ARROW_CHECK_OK(internal::ComputeRowMajorStrides( + checked_cast(*type_), shape, &strides_)); } } diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index 22da07a16edd1..91e9ad26066f0 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -56,13 +56,14 @@ static inline bool is_tensor_supported(Type::type type_id) { namespace internal { ARROW_EXPORT -void ComputeRowMajorStrides(const FixedWidthType& type, const std::vector& shape, - std::vector* strides); +Status 
ComputeRowMajorStrides(const FixedWidthType& type, + const std::vector& shape, + std::vector* strides); ARROW_EXPORT -void ComputeColumnMajorStrides(const FixedWidthType& type, - const std::vector& shape, - std::vector* strides); +Status ComputeColumnMajorStrides(const FixedWidthType& type, + const std::vector& shape, + std::vector* strides); ARROW_EXPORT bool IsTensorStridesContiguous(const std::shared_ptr& type, diff --git a/cpp/src/arrow/tensor/coo_converter.cc b/cpp/src/arrow/tensor/coo_converter.cc index 6ef0361cb0fe9..2124d0a4e4b79 100644 --- a/cpp/src/arrow/tensor/coo_converter.cc +++ b/cpp/src/arrow/tensor/coo_converter.cc @@ -213,9 +213,9 @@ class SparseCOOTensorConverter : private SparseTensorConverterMixin { // make results const std::vector indices_shape = {nonzero_count, ndim}; std::vector indices_strides; - internal::ComputeRowMajorStrides( + RETURN_NOT_OK(internal::ComputeRowMajorStrides( checked_cast(*index_value_type_), indices_shape, - &indices_strides); + &indices_strides)); auto coords = std::make_shared(index_value_type_, std::move(indices_buffer), indices_shape, indices_strides); ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true)); @@ -305,7 +305,7 @@ Result> MakeTensorFromSparseCOOTensor( std::fill_n(values, value_elsize * sparse_tensor->size(), 0); std::vector strides; - ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides); + RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides)); const auto* raw_data = sparse_tensor->raw_data(); const int ndim = sparse_tensor->ndim(); diff --git a/cpp/src/arrow/tensor/csf_converter.cc b/cpp/src/arrow/tensor/csf_converter.cc index 2d1c13566213d..77a71d8a12e4b 100644 --- a/cpp/src/arrow/tensor/csf_converter.cc +++ b/cpp/src/arrow/tensor/csf_converter.cc @@ -211,7 +211,7 @@ class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin { } Result> Build() { - internal::ComputeRowMajorStrides(value_type_, shape_, &strides_); + 
RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_)); ARROW_ASSIGN_OR_RAISE(values_buffer_, AllocateBuffer(value_elsize_ * tensor_size_, pool_)); diff --git a/cpp/src/arrow/tensor/csx_converter.cc b/cpp/src/arrow/tensor/csx_converter.cc index 5ce99d4c3e658..137b5d3202f22 100644 --- a/cpp/src/arrow/tensor/csx_converter.cc +++ b/cpp/src/arrow/tensor/csx_converter.cc @@ -177,7 +177,7 @@ Result> MakeTensorFromSparseCSXMatrix( std::fill_n(values, value_elsize * tensor_size, 0); std::vector strides; - ComputeRowMajorStrides(fw_value_type, shape, &strides); + RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides)); const auto nc = shape[1]; diff --git a/cpp/src/arrow/tensor_test.cc b/cpp/src/arrow/tensor_test.cc index 42c5540f6b5ff..efb1b8d9232b7 100644 --- a/cpp/src/arrow/tensor_test.cc +++ b/cpp/src/arrow/tensor_test.cc @@ -24,6 +24,7 @@ #include #include +#include #include #include "arrow/buffer.h" @@ -38,6 +39,96 @@ void AssertCountNonZero(const Tensor& t, int64_t expected) { ASSERT_EQ(count, expected); } +TEST(TestComputeRowMajorStrides, ZeroDimension) { + std::vector strides; + + std::vector shape1 = {0, 2, 3}; + ASSERT_OK(arrow::internal::ComputeRowMajorStrides(DoubleType(), shape1, &strides)); + EXPECT_THAT(strides, + testing::ElementsAre(sizeof(double), sizeof(double), sizeof(double))); + + std::vector shape2 = {2, 0, 3}; + strides.clear(); + ASSERT_OK(arrow::internal::ComputeRowMajorStrides(DoubleType(), shape2, &strides)); + EXPECT_THAT(strides, + testing::ElementsAre(sizeof(double), sizeof(double), sizeof(double))); + + std::vector shape3 = {2, 3, 0}; + strides.clear(); + ASSERT_OK(arrow::internal::ComputeRowMajorStrides(DoubleType(), shape3, &strides)); + EXPECT_THAT(strides, + testing::ElementsAre(sizeof(double), sizeof(double), sizeof(double))); +} + +TEST(TestComputeRowMajorStrides, MaximumSize) { + constexpr uint64_t total_length = + 1 + static_cast(std::numeric_limits::max()); + std::vector shape = {2, 2, 
static_cast(total_length / 4)}; + + std::vector strides; + ASSERT_OK(arrow::internal::ComputeRowMajorStrides(Int8Type(), shape, &strides)); + EXPECT_THAT(strides, testing::ElementsAre(2 * shape[2], shape[2], 1)); +} + +TEST(TestComputeRowMajorStrides, OverflowCase) { + constexpr uint64_t total_length = + 1 + static_cast(std::numeric_limits::max()); + std::vector shape = {2, 2, static_cast(total_length / 4)}; + + std::vector strides; + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr( + "Row-major strides computed from shape would not fit in 64-bit integer"), + arrow::internal::ComputeRowMajorStrides(Int16Type(), shape, &strides)); + EXPECT_EQ(0, strides.size()); +} + +TEST(TestComputeColumnMajorStrides, ZeroDimension) { + std::vector strides; + + std::vector shape1 = {0, 2, 3}; + ASSERT_OK(arrow::internal::ComputeColumnMajorStrides(DoubleType(), shape1, &strides)); + EXPECT_THAT(strides, + testing::ElementsAre(sizeof(double), sizeof(double), sizeof(double))); + + std::vector shape2 = {2, 0, 3}; + strides.clear(); + ASSERT_OK(arrow::internal::ComputeColumnMajorStrides(DoubleType(), shape2, &strides)); + EXPECT_THAT(strides, + testing::ElementsAre(sizeof(double), sizeof(double), sizeof(double))); + + std::vector shape3 = {2, 3, 0}; + strides.clear(); + ASSERT_OK(arrow::internal::ComputeColumnMajorStrides(DoubleType(), shape3, &strides)); + EXPECT_THAT(strides, + testing::ElementsAre(sizeof(double), sizeof(double), sizeof(double))); +} + +TEST(TestComputeColumnMajorStrides, MaximumSize) { + constexpr uint64_t total_length = + 1 + static_cast(std::numeric_limits::max()); + std::vector shape = {static_cast(total_length / 4), 2, 2}; + + std::vector strides; + ASSERT_OK(arrow::internal::ComputeColumnMajorStrides(Int8Type(), shape, &strides)); + EXPECT_THAT(strides, testing::ElementsAre(1, shape[0], 2 * shape[0])); +} + +TEST(TestComputeColumnMajorStrides, OverflowCase) { + constexpr uint64_t total_length = + 1 + static_cast(std::numeric_limits::max()); + 
std::vector shape = {static_cast(total_length / 4), 2, 2}; + + std::vector strides; + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr( + "Column-major strides computed from shape would not fit in 64-bit integer"), + arrow::internal::ComputeColumnMajorStrides(Int16Type(), shape, &strides)); + EXPECT_EQ(0, strides.size()); +} + TEST(TestTensor, MakeRowMajor) { std::vector shape = {3, 6}; std::vector strides = {sizeof(double) * 6, sizeof(double)}; @@ -152,6 +243,20 @@ TEST(TestTensor, MakeFailureCases) { // negative items in shape ASSERT_RAISES(Invalid, Tensor::Make(float64(), data, {-3, 6})); + // overflow in positive strides computation + constexpr uint64_t total_length = + 1 + static_cast(std::numeric_limits::max()); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr( + "Row-major strides computed from shape would not fit in 64-bit integer"), + Tensor::Make(float64(), data, {2, 2, static_cast(total_length / 4)})); + + // negative strides are prohibited + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("negative strides not supported"), + Tensor::Make(float64(), data, {18}, {-(int)sizeof(double)})); + // invalid stride length ASSERT_RAISES(Invalid, Tensor::Make(float64(), data, shape, {sizeof(double)})); ASSERT_RAISES(Invalid, Tensor::Make(float64(), data, shape, diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index cdb23a92899f2..718d2a3156a4c 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -44,16 +44,14 @@ // NOTE: failing must be inline in the macros below, to get correct file / line number // reporting on test failures. 
-#define ASSERT_RAISES(ENUM, expr) \ - do { \ - auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ - if (!_st.Is##ENUM()) { \ - FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ - ENUM) ", but got " \ - << _st.ToString(); \ - } \ - } while (false) +// NOTE: using a for loop for this macro allows extra failure messages to be +// appended with operator<< +#define ASSERT_RAISES(ENUM, expr) \ + for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); \ + !_st.Is##ENUM();) \ + FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ + ENUM) ", but got " \ + << _st.ToString() #define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ do { \ @@ -135,15 +133,55 @@ ASSERT_EQ(expected, _actual); \ } while (0) +// This macro should be called by futures that are expected to +// complete pretty quickly. 2 seconds is the default max wait +// here. Anything longer than that and it's a questionable +// unit test anyways. 
+#define ASSERT_FINISHES_IMPL(fut) \ + do { \ + ASSERT_TRUE(fut.Wait(10)); \ + if (!fut.is_finished()) { \ + FAIL() << "Future did not finish in a timely fashion"; \ + } \ + } while (false) + +#define ASSERT_FINISHES_OK(expr) \ + do { \ + auto&& _fut = (expr); \ + ASSERT_TRUE(_fut.Wait(10)); \ + if (!_fut.is_finished()) { \ + FAIL() << "Future did not finish in a timely fashion"; \ + } \ + auto _st = _fut.status(); \ + if (!_st.ok()) { \ + FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \ + } \ + } while (false) + +#define ASSERT_FINISHES_ERR(ENUM, expr) \ + do { \ + auto&& fut = (expr); \ + ASSERT_FINISHES_IMPL(fut); \ + ASSERT_RAISES(ENUM, fut.status()); \ + } while (false) + +#define ASSERT_FINISHES_OK_AND_ASSIGN_IMPL(lhs, rexpr, future_name) \ + auto future_name = (rexpr); \ + ASSERT_FINISHES_IMPL(future_name); \ + ASSERT_OK_AND_ASSIGN(lhs, future_name.result()); + +#define ASSERT_FINISHES_OK_AND_ASSIGN(lhs, rexpr) \ + ASSERT_FINISHES_OK_AND_ASSIGN_IMPL(lhs, rexpr, \ + ARROW_ASSIGN_OR_RAISE_NAME(_fut, __COUNTER__)) + namespace arrow { +// ---------------------------------------------------------------------- +// Useful testing::Types declarations inline void PrintTo(StatusCode code, std::ostream* os) { *os << Status::CodeAsString(code); } -// ---------------------------------------------------------------------- -// Useful testing::Types declarations - using NumericArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 12d3951865f78..9192c325bbf3e 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1298,29 +1298,56 @@ void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); } // ---------------------------------------------------------------------- // Schema implementation +std::string EndiannessToString(Endianness endianness) { + switch (endianness) { + case Endianness::Little: + return "little"; + case Endianness::Big: + return "big"; + default: + DCHECK(false) 
<< "invalid endianness"; + return "???"; + } +} + class Schema::Impl { public: - Impl(std::vector> fields, + Impl(std::vector> fields, Endianness endianness, std::shared_ptr metadata) : fields_(std::move(fields)), + endianness_(endianness), name_to_index_(CreateNameToIndexMap(fields_)), metadata_(std::move(metadata)) {} std::vector> fields_; + Endianness endianness_; std::unordered_multimap name_to_index_; std::shared_ptr metadata_; }; +Schema::Schema(std::vector> fields, Endianness endianness, + std::shared_ptr metadata) + : detail::Fingerprintable(), + impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {} + Schema::Schema(std::vector> fields, std::shared_ptr metadata) : detail::Fingerprintable(), - impl_(new Impl(std::move(fields), std::move(metadata))) {} + impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {} Schema::Schema(const Schema& schema) : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} Schema::~Schema() = default; +std::shared_ptr Schema::WithEndianness(Endianness endianness) const { + return std::make_shared(impl_->fields_, endianness, impl_->metadata_); +} + +Endianness Schema::endianness() const { return impl_->endianness_; } + +bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; } + int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } const std::shared_ptr& Schema::field(int i) const { @@ -1338,6 +1365,11 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const { return true; } + // checks endianness equality + if (endianness() != other.endianness()) { + return false; + } + // checks field equality if (num_fields() != other.num_fields()) { return false; @@ -1482,6 +1514,10 @@ std::string Schema::ToString(bool show_metadata) const { ++i; } + if (impl_->endianness_ != Endianness::Native) { + buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --"; + } + if (show_metadata && HasMetadata()) { buffer 
<< impl_->metadata_->ToString(); } @@ -1661,6 +1697,12 @@ std::shared_ptr schema(std::vector> fields, return std::make_shared(std::move(fields), std::move(metadata)); } +std::shared_ptr schema(std::vector> fields, + Endianness endianness, + std::shared_ptr metadata) { + return std::make_shared(std::move(fields), endianness, std::move(metadata)); +} + Result> UnifySchemas( const std::vector>& schemas, const Field::MergeOptions field_merge_options) { @@ -1819,6 +1861,7 @@ std::string Schema::ComputeFingerprint() const { } ss << field_fingerprint << ";"; } + ss << (endianness() == Endianness::Little ? "L" : "B"); ss << "}"; return ss.str(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 56718b7c5122e..0672354ab6cd2 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -30,6 +30,7 @@ #include "arrow/result.h" #include "arrow/type_fwd.h" // IWYU pragma: export #include "arrow/util/checked_cast.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" #include "arrow/util/variant.h" #include "arrow/util/visibility.h" @@ -1604,6 +1605,16 @@ class ARROW_EXPORT FieldRef { // ---------------------------------------------------------------------- // Schema +enum class Endianness { + Little = 0, + Big = 1, +#if ARROW_LITTLE_ENDIAN + Native = Little +#else + Native = Big +#endif +}; + /// \class Schema /// \brief Sequence of arrow::Field objects describing the columns of a record /// batch or table data structure @@ -1611,6 +1622,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, public util::EqualityComparable, public util::ToStringOstreamable { public: + explicit Schema(std::vector> fields, Endianness endianness, + std::shared_ptr metadata = NULLPTR); + explicit Schema(std::vector> fields, std::shared_ptr metadata = NULLPTR); @@ -1622,6 +1636,17 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, bool Equals(const Schema& other, bool check_metadata = false) const; bool Equals(const std::shared_ptr& other, bool 
check_metadata = false) const; + /// \brief Set endianness in the schema + /// + /// \return new Schema + std::shared_ptr WithEndianness(Endianness endianness) const; + + /// \brief Return endianness in the schema + Endianness endianness() const; + + /// \brief Indicate if endianness is equal to platform-native endianness + bool is_native_endian() const; + /// \brief Return the number of fields (columns) in the schema int num_fields() const; @@ -1690,6 +1715,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, std::unique_ptr impl_; }; +ARROW_EXPORT +std::string EndiannessToString(Endianness endianness); + // ---------------------------------------------------------------------- /// \brief Convenience class to incrementally construct/merge schemas. diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index f1000d1fe7fb7..14329675c8f1b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -52,6 +52,7 @@ class DataType; class Field; class FieldRef; class KeyValueMetadata; +enum class Endianness; class Schema; using DataTypeVector = std::vector>; @@ -635,6 +636,17 @@ std::shared_ptr schema( std::vector> fields, std::shared_ptr metadata = NULLPTR); +/// \brief Create a Schema instance +/// +/// \param fields the schema's fields +/// \param endianness the endianness of the data +/// \param metadata any custom key-value metadata, default null +/// \return schema shared_ptr to Schema +ARROW_EXPORT +std::shared_ptr schema( + std::vector> fields, Endianness endianness, + std::shared_ptr metadata = NULLPTR); + /// @} /// Return the process-wide default memory pool. 
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 81a0315d6d178..da93e32936cd5 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -475,6 +475,31 @@ TEST_F(TestSchema, Basics) { ASSERT_EQ(schema->fingerprint(), schema2->fingerprint()); ASSERT_NE(schema->fingerprint(), schema3->fingerprint()); + + auto schema4 = ::arrow::schema({f0}, Endianness::Little); + auto schema5 = ::arrow::schema({f0}, Endianness::Little); + auto schema6 = ::arrow::schema({f0}, Endianness::Big); + auto schema7 = ::arrow::schema({f0}); + + AssertSchemaEqual(schema4, schema5); + AssertSchemaNotEqual(schema4, schema6); +#if ARROW_LITTLE_ENDIAN + AssertSchemaEqual(schema4, schema7); + AssertSchemaNotEqual(schema6, schema7); +#else + AssertSchemaNotEqual(schema4, schema6); + AssertSchemaEqual(schema6, schema7); +#endif + + ASSERT_EQ(schema4->fingerprint(), schema5->fingerprint()); + ASSERT_NE(schema4->fingerprint(), schema6->fingerprint()); +#if ARROW_LITTLE_ENDIAN + ASSERT_EQ(schema4->fingerprint(), schema7->fingerprint()); + ASSERT_NE(schema6->fingerprint(), schema7->fingerprint()); +#else + ASSERT_NE(schema4->fingerprint(), schema7->fingerprint()); + ASSERT_EQ(schema6->fingerprint(), schema7->fingerprint()); +#endif } TEST_F(TestSchema, ToString) { @@ -495,14 +520,38 @@ f3: list)"; ASSERT_EQ(expected, result); result = schema->ToString(/*print_metadata=*/true); + std::string expected_with_metadata = expected + R"( +-- metadata -- +foo: bar)"; + + ASSERT_EQ(expected_with_metadata, result); + + // With swapped endianness +#if ARROW_LITTLE_ENDIAN + schema = schema->WithEndianness(Endianness::Big); + expected = R"(f0: int32 +f1: uint8 not null +f2: string +f3: list +-- endianness: big --)"; +#else + schema = schema->WithEndianness(Endianness::Little); expected = R"(f0: int32 f1: uint8 not null f2: string f3: list +-- endianness: little --)"; +#endif + + result = schema->ToString(); + ASSERT_EQ(expected, result); + + result = 
schema->ToString(/*print_metadata=*/true); + expected_with_metadata = expected + R"( -- metadata -- foo: bar)"; - ASSERT_EQ(expected, result); + ASSERT_EQ(expected_with_metadata, result); } TEST_F(TestSchema, GetFieldByName) { diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index e872a31f31d94..b74aa3b0adbcb 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -991,4 +991,22 @@ static inline bool is_nested(Type::type type_id) { return false; } +static inline int offset_bit_width(Type::type type_id) { + switch (type_id) { + case Type::STRING: + case Type::BINARY: + case Type::LIST: + case Type::MAP: + case Type::DENSE_UNION: + return 32; + case Type::LARGE_STRING: + case Type::LARGE_BINARY: + case Type::LARGE_LIST: + return 64; + default: + break; + } + return 0; +} + } // namespace arrow diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index f5c658d08f2c0..718307deedf86 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -43,6 +43,7 @@ add_arrow_test(utility-test align_util_test.cc bit_block_counter_test.cc bit_util_test.cc + cache_test.cc checked_cast_test.cc compression_test.cc decimal_test.cc @@ -73,6 +74,7 @@ add_arrow_test(threading-utility-test add_arrow_benchmark(bit_block_counter_benchmark) add_arrow_benchmark(bit_util_benchmark) +add_arrow_benchmark(cache_benchmark) add_arrow_benchmark(compression_benchmark) add_arrow_benchmark(decimal_benchmark) add_arrow_benchmark(hashing_benchmark) diff --git a/cpp/src/arrow/util/async_generator.h b/cpp/src/arrow/util/async_generator.h new file mode 100644 index 0000000000000..424810c523a9d --- /dev/null +++ b/cpp/src/arrow/util/async_generator.h @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once +#include + +#include "arrow/util/functional.h" +#include "arrow/util/future.h" +#include "arrow/util/iterator.h" +#include "arrow/util/optional.h" +#include "arrow/util/thread_pool.h" + +namespace arrow { + +template +using AsyncGenerator = std::function()>; + +/// Iterates through a generator of futures, visiting the result of each one and +/// returning a future that completes when all have been visited +template +Future<> VisitAsyncGenerator(AsyncGenerator generator, + std::function visitor) { + struct LoopBody { + struct Callback { + Result> operator()(const T& result) { + if (result == IterationTraits::End()) { + return Break(detail::Empty()); + } else { + auto visited = visitor(result); + if (visited.ok()) { + return Continue(); + } else { + return visited; + } + } + } + + std::function visitor; + }; + + Future> operator()() { + Callback callback{visitor}; + auto next = generator(); + return next.Then(std::move(callback)); + } + + AsyncGenerator generator; + std::function visitor; + }; + + return Loop(LoopBody{std::move(generator), std::move(visitor)}); +} + +template +Future> CollectAsyncGenerator(AsyncGenerator generator) { + auto vec = std::make_shared>(); + struct LoopBody { + Future>> operator()() { + auto next = generator(); + auto vec = vec_; + return next.Then([vec](const T& result) -> Result>> { + if (result == IterationTraits::End()) { + return 
Break(*vec); + } else { + vec->push_back(result); + return Continue(); + } + }); + } + AsyncGenerator generator; + std::shared_ptr> vec_; + }; + return Loop(LoopBody{std::move(generator), std::move(vec)}); +} + +template +class TransformingGenerator { + // The transforming generator state will be referenced as an async generator but will + // also be referenced via callback to various futures. If the async generator owner + // moves it around we need the state to be consistent for future callbacks. + struct TransformingGeneratorState + : std::enable_shared_from_this { + TransformingGeneratorState(AsyncGenerator generator, Transformer transformer) + : generator_(std::move(generator)), + transformer_(std::move(transformer)), + last_value_(), + finished_() {} + + Future operator()() { + while (true) { + auto maybe_next_result = Pump(); + if (!maybe_next_result.ok()) { + return Future::MakeFinished(maybe_next_result.status()); + } + auto maybe_next = std::move(maybe_next_result).ValueUnsafe(); + if (maybe_next.has_value()) { + return Future::MakeFinished(*std::move(maybe_next)); + } + + auto next_fut = generator_(); + // If finished already, process results immediately inside the loop to avoid stack + // overflow + if (next_fut.is_finished()) { + auto next_result = next_fut.result(); + if (next_result.ok()) { + last_value_ = *next_result; + } else { + return Future::MakeFinished(next_result.status()); + } + // Otherwise, if not finished immediately, add callback to process results + } else { + auto self = this->shared_from_this(); + return next_fut.Then([self](const Result& next_result) { + if (next_result.ok()) { + self->last_value_ = *next_result; + return (*self)(); + } else { + return Future::MakeFinished(next_result.status()); + } + }); + } + } + } + + // See comment on TransformingIterator::Pump + Result> Pump() { + if (!finished_ && last_value_.has_value()) { + ARROW_ASSIGN_OR_RAISE(TransformFlow next, transformer_(*last_value_)); + if (next.ReadyForNext()) { + 
if (*last_value_ == IterationTraits::End()) { + finished_ = true; + } + last_value_.reset(); + } + if (next.Finished()) { + finished_ = true; + } + if (next.HasValue()) { + return next.Value(); + } + } + if (finished_) { + return IterationTraits::End(); + } + return util::nullopt; + } + + AsyncGenerator generator_; + Transformer transformer_; + util::optional last_value_; + bool finished_; + }; + + public: + explicit TransformingGenerator(AsyncGenerator generator, + Transformer transformer) + : state_(std::make_shared(std::move(generator), + std::move(transformer))) {} + + Future operator()() { return (*state_)(); } + + protected: + std::shared_ptr state_; +}; + +template +class ReadaheadGenerator { + public: + ReadaheadGenerator(AsyncGenerator source_generator, int max_readahead) + : source_generator_(std::move(source_generator)), max_readahead_(max_readahead) { + auto finished = std::make_shared>(false); + mark_finished_if_done_ = [finished](const Result& next_result) { + if (!next_result.ok()) { + finished->store(true); + } else { + const auto& next = *next_result; + if (next == IterationTraits::End()) { + *finished = true; + } + } + }; + finished_ = std::move(finished); + } + + Future operator()() { + if (readahead_queue_.empty()) { + // This is the first request, let's pump the underlying queue + for (int i = 0; i < max_readahead_; i++) { + auto next = source_generator_(); + next.AddCallback(mark_finished_if_done_); + readahead_queue_.push(std::move(next)); + } + } + // Pop one and add one + auto result = readahead_queue_.front(); + readahead_queue_.pop(); + if (finished_->load()) { + readahead_queue_.push(Future::MakeFinished(IterationTraits::End())); + } else { + auto back_of_queue = source_generator_(); + back_of_queue.AddCallback(mark_finished_if_done_); + readahead_queue_.push(std::move(back_of_queue)); + } + return result; + } + + private: + AsyncGenerator source_generator_; + int max_readahead_; + std::function&)> mark_finished_if_done_; + // Can't use 
a bool here because finished may be referenced by callbacks that + // outlive this class + std::shared_ptr> finished_; + std::queue> readahead_queue_; +}; + +/// \brief Creates a generator that pulls reentrantly from a source +/// This generator will pull reentrantly from a source, ensuring that max_readahead +/// requests are active at any given time. +/// +/// The source generator must be async-reentrant +/// +/// This generator itself is async-reentrant. +template +AsyncGenerator MakeReadaheadGenerator(AsyncGenerator source_generator, + int max_readahead) { + return ReadaheadGenerator(std::move(source_generator), max_readahead); +} + +/// \brief Transforms an async generator using a transformer function returning a new +/// AsyncGenerator +/// +/// The transform function here behaves exactly the same as the transform function in +/// MakeTransformedIterator and you can safely use the same transform function to +/// transform both synchronous and asynchronous streams. +/// +/// This generator is not async-reentrant +template +AsyncGenerator MakeAsyncGenerator(AsyncGenerator generator, + Transformer transformer) { + return TransformingGenerator(generator, transformer); +} + +/// \brief Transfers execution of the generator onto the given executor +/// +/// This generator is async-reentrant if the source generator is async-reentrant +template +class TransferringGenerator { + public: + explicit TransferringGenerator(AsyncGenerator source, internal::Executor* executor) + : source_(std::move(source)), executor_(executor) {} + + Future operator()() { return executor_->Transfer(source_()); } + + private: + AsyncGenerator source_; + internal::Executor* executor_; +}; + +/// \brief Transfers a future to an underlying executor. +/// +/// Continuations run on the returned future will be run on the given executor +/// if they cannot be run synchronously. 
+/// +/// This is often needed to move computation off I/O threads or other external +/// completion sources and back on to the CPU executor so the I/O thread can +/// stay busy and focused on I/O +/// +/// Keep in mind that continuations called on an already completed future will +/// always be run synchronously and so no transfer will happen in that case. +template +AsyncGenerator MakeTransferredGenerator(AsyncGenerator source, + internal::Executor* executor) { + return TransferringGenerator(std::move(source), executor); +} + +/// \brief Async generator that iterates on an underlying iterator in a +/// separate executor. +/// +/// This generator is async-reentrant +template +class BackgroundGenerator { + public: + explicit BackgroundGenerator(Iterator it, internal::Executor* io_executor) + : io_executor_(io_executor) { + task_ = Task{std::make_shared>(std::move(it)), + std::make_shared>(false)}; + } + + ~BackgroundGenerator() { + // The thread pool will be disposed of automatically. By default it will not wait + // so the background thread may outlive this object. That should be ok. Any task + // objects in the thread pool are copies of task_ and have their own shared_ptr to + // the iterator. + } + + ARROW_DEFAULT_MOVE_AND_ASSIGN(BackgroundGenerator); + ARROW_DISALLOW_COPY_AND_ASSIGN(BackgroundGenerator); + + Future operator()() { + auto submitted_future = io_executor_->Submit(task_); + if (!submitted_future.ok()) { + return Future::MakeFinished(submitted_future.status()); + } + return std::move(*submitted_future); + } + + protected: + struct Task { + Result operator()() { + if (*done_) { + return IterationTraits::End(); + } + auto next = it_->Next(); + if (!next.ok() || *next == IterationTraits::End()) { + *done_ = true; + } + return next; + } + // This task is going to be copied so we need to convert the iterator ptr to + // a shared ptr. 
This should be safe however because the background executor only + // has a single thread so it can't access it_ across multiple threads. + std::shared_ptr> it_; + std::shared_ptr> done_; + }; + + Task task_; + internal::Executor* io_executor_; +}; + +/// \brief Creates an AsyncGenerator by iterating over an Iterator on a background +/// thread +template +static Result> MakeBackgroundGenerator( + Iterator iterator, internal::Executor* io_executor) { + auto background_iterator = std::make_shared>( + std::move(iterator), std::move(io_executor)); + return [background_iterator]() { return (*background_iterator)(); }; +} + +/// \brief Converts an AsyncGenerator to an Iterator by blocking until each future +/// is finished +template +class GeneratorIterator { + public: + explicit GeneratorIterator(AsyncGenerator source) : source_(std::move(source)) {} + + Result Next() { return source_().result(); } + + private: + AsyncGenerator source_; +}; + +template +Result> MakeGeneratorIterator(AsyncGenerator source) { + return Iterator(GeneratorIterator(std::move(source))); +} + +template +Result> MakeReadaheadIterator(Iterator it, int readahead_queue_size) { + ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1)); + ARROW_ASSIGN_OR_RAISE(auto background_generator, + MakeBackgroundGenerator(std::move(it), io_executor.get())); + // Capture io_executor to keep it alive as long as owned_bg_generator is still + // referenced + AsyncGenerator owned_bg_generator = [io_executor, background_generator]() { + return background_generator(); + }; + auto readahead_generator = + MakeReadaheadGenerator(std::move(owned_bg_generator), readahead_queue_size); + return MakeGeneratorIterator(std::move(readahead_generator)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index 78d5b15d1c040..d9d6f4f42fa04 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -28,6 +28,7 @@ 
#include #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/int128_internal.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h index 0b6199cf15edd..803b825e1b226 100644 --- a/cpp/src/arrow/util/bit_block_counter.h +++ b/cpp/src/arrow/util/bit_block_counter.h @@ -25,6 +25,7 @@ #include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h index 39ff049428d4d..b24632a6e5e42 100644 --- a/cpp/src/arrow/util/bit_run_reader.h +++ b/cpp/src/arrow/util/bit_run_reader.h @@ -24,6 +24,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 74f7e61e9cc16..01845791faabd 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -17,42 +17,14 @@ #pragma once -#ifdef _WIN32 -#define ARROW_LITTLE_ENDIAN 1 -#else -#if defined(__APPLE__) || defined(__FreeBSD__) -#include // IWYU pragma: keep -#else -#include // IWYU pragma: keep -#endif -# -#ifndef __BYTE_ORDER__ -#error "__BYTE_ORDER__ not defined" -#endif -# -#ifndef __ORDER_LITTLE_ENDIAN__ -#error "__ORDER_LITTLE_ENDIAN__ not defined" -#endif -# -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define ARROW_LITTLE_ENDIAN 1 -#else -#define ARROW_LITTLE_ENDIAN 0 -#endif -#endif - #if defined(_MSC_VER) #include // IWYU pragma: keep #include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) -#define ARROW_BYTE_SWAP64 _byteswap_uint64 -#define ARROW_BYTE_SWAP32 _byteswap_ulong #define ARROW_POPCOUNT64 __popcnt64 
#define ARROW_POPCOUNT32 __popcnt #else -#define ARROW_BYTE_SWAP64 __builtin_bswap64 -#define ARROW_BYTE_SWAP32 __builtin_bswap32 #define ARROW_POPCOUNT64 __builtin_popcountll #define ARROW_POPCOUNT32 __builtin_popcount #endif @@ -61,7 +33,6 @@ #include #include "arrow/util/macros.h" -#include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" namespace arrow { @@ -301,116 +272,6 @@ static inline int Log2(uint64_t x) { return NumRequiredBits(x - 1); } -// -// Byte-swap 16-bit, 32-bit and 64-bit values -// - -// Swap the byte order (i.e. endianness) -static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } -static inline uint64_t ByteSwap(uint64_t value) { - return static_cast(ARROW_BYTE_SWAP64(value)); -} -static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); } -static inline uint32_t ByteSwap(uint32_t value) { - return static_cast(ARROW_BYTE_SWAP32(value)); -} -static inline int16_t ByteSwap(int16_t value) { - constexpr auto m = static_cast(0xff); - return static_cast(((value >> 8) & m) | ((value & m) << 8)); -} -static inline uint16_t ByteSwap(uint16_t value) { - return static_cast(ByteSwap(static_cast(value))); -} -static inline uint8_t ByteSwap(uint8_t value) { return value; } - -// Write the swapped bytes into dst. Src and dst cannot overlap. -static inline void ByteSwap(void* dst, const void* src, int len) { - switch (len) { - case 1: - *reinterpret_cast(dst) = *reinterpret_cast(src); - return; - case 2: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - case 4: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - case 8: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - default: - break; - } - - auto d = reinterpret_cast(dst); - auto s = reinterpret_cast(src); - for (int i = 0; i < len; ++i) { - d[i] = s[len - i - 1]; - } -} - -// Convert to little/big endian format from the machine's native endian format. 
-#if ARROW_LITTLE_ENDIAN -template > -static inline T ToBigEndian(T value) { - return ByteSwap(value); -} - -template > -static inline T ToLittleEndian(T value) { - return value; -} -#else -template > -static inline T ToBigEndian(T value) { - return value; -} - -template > -static inline T ToLittleEndian(T value) { - return ByteSwap(value); -} -#endif - -// Convert from big/little endian format to the machine's native endian format. -#if ARROW_LITTLE_ENDIAN -template > -static inline T FromBigEndian(T value) { - return ByteSwap(value); -} - -template > -static inline T FromLittleEndian(T value) { - return value; -} -#else -template > -static inline T FromBigEndian(T value) { - return value; -} - -template > -static inline T FromLittleEndian(T value) { - return ByteSwap(value); -} -#endif - // // Utilities for reading and writing individual bits by their index // in a memory area. diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index c71abde9409d7..e5a5e4c39bef5 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -44,7 +44,6 @@ #include "arrow/type_fwd.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_stream_utils.h" -#include "arrow/util/bit_util.h" #include "arrow/util/bitmap.h" #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" @@ -52,6 +51,7 @@ #include "arrow/util/bitmap_visit.h" #include "arrow/util/bitmap_writer.h" #include "arrow/util/bitset_stack.h" +#include "arrow/util/endian.h" namespace arrow { @@ -1786,6 +1786,20 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11)), 0x11); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11)), 0x11); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + uint32_t srci32 
= 0xaabbccdd, expectedi32 = 0xddccbbaa; + EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci32)), + *reinterpret_cast(&expectedi32)); + uint64_t srci64 = 0xaabb11223344ccdd, expectedi64 = 0xddcc44332211bbaa; + EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci64)), + *reinterpret_cast(&expectedi64)); } TEST(BitUtil, Log2) { diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h index c26d75f0b5380..8562c55e3d533 100644 --- a/cpp/src/arrow/util/bitmap.h +++ b/cpp/src/arrow/util/bitmap.h @@ -30,6 +30,7 @@ #include "arrow/buffer.h" #include "arrow/util/bit_util.h" #include "arrow/util/compare.h" +#include "arrow/util/endian.h" #include "arrow/util/functional.h" #include "arrow/util/string_builder.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc index 9f1c63653d65c..1f9cf19bbd0e4 100644 --- a/cpp/src/arrow/util/bitmap_ops.cc +++ b/cpp/src/arrow/util/bitmap_ops.cc @@ -28,6 +28,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/bitmap_writer.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h index e1412ac8d70fc..cf4f5e7db8ba8 100644 --- a/cpp/src/arrow/util/bitmap_reader.h +++ b/cpp/src/arrow/util/bitmap_reader.h @@ -22,6 +22,7 @@ #include "arrow/buffer.h" #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" namespace arrow { diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h index 7cb2fc6a98f53..d4f02f37a416e 100644 --- a/cpp/src/arrow/util/bitmap_writer.h +++ b/cpp/src/arrow/util/bitmap_writer.h @@ -21,6 +21,7 @@ #include #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking.h b/cpp/src/arrow/util/bpacking.h index 
71714c4c7d826..e5a4dbbed89ec 100644 --- a/cpp/src/arrow/util/bpacking.h +++ b/cpp/src/arrow/util/bpacking.h @@ -17,6 +17,7 @@ #pragma once +#include "arrow/util/endian.h" #include "arrow/util/visibility.h" #include diff --git a/cpp/src/arrow/util/cache_benchmark.cc b/cpp/src/arrow/util/cache_benchmark.cc new file mode 100644 index 0000000000000..7439ee2f5013f --- /dev/null +++ b/cpp/src/arrow/util/cache_benchmark.cc @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "benchmark/benchmark.h" + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/testing/random.h" +#include "arrow/util/cache_internal.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace internal { + +static constexpr int32_t kCacheSize = 100; +static constexpr int32_t kSmallKeyLength = 8; +static constexpr int32_t kLargeKeyLength = 64; +static constexpr int32_t kSmallValueLength = 16; +static constexpr int32_t kLargeValueLength = 1024; + +static std::vector MakeStrings(int64_t nvalues, int64_t min_length, + int64_t max_length) { + auto rng = ::arrow::random::RandomArrayGenerator(42); + auto arr = checked_pointer_cast(rng.String( + nvalues, static_cast(min_length), static_cast(max_length))); + std::vector vec(nvalues); + for (int64_t i = 0; i < nvalues; ++i) { + vec[i] = arr->GetString(i); + } + return vec; +} + +static std::vector MakeStrings(int64_t nvalues, int64_t length) { + return MakeStrings(nvalues, length, length); +} + +template +static void BenchmarkCacheLookups(benchmark::State& state, const std::vector& keys, + const std::vector& values) { + const int32_t nitems = static_cast(keys.size()); + Cache cache(nitems); + for (int32_t i = 0; i < nitems; ++i) { + cache.Replace(keys[i], values[i]); + } + + for (auto _ : state) { + int64_t nfinds = 0; + for (const auto& key : keys) { + nfinds += (cache.Find(key) != nullptr); + } + benchmark::DoNotOptimize(nfinds); + ARROW_CHECK_EQ(nfinds, nitems); + } + state.SetItemsProcessed(state.iterations() * nitems); +} + +static void LruCacheLookup(benchmark::State& state) { + const auto keys = MakeStrings(kCacheSize, state.range(0)); + const auto values = MakeStrings(kCacheSize, state.range(1)); + BenchmarkCacheLookups>(state, keys, values); +} + +static void SetCacheArgs(benchmark::internal::Benchmark* bench) { + bench->Args({kSmallKeyLength, kSmallValueLength}); + bench->Args({kSmallKeyLength, 
kLargeValueLength}); + bench->Args({kLargeKeyLength, kSmallValueLength}); + bench->Args({kLargeKeyLength, kLargeValueLength}); +} + +BENCHMARK(LruCacheLookup)->Apply(SetCacheArgs); + +struct Callable { + explicit Callable(std::vector values) + : index_(0), values_(std::move(values)) {} + + std::string operator()(const std::string& key) { + // Return a value unrelated to the key + if (++index_ >= static_cast(values_.size())) { + index_ = 0; + } + return values_[index_]; + } + + private: + int64_t index_; + std::vector values_; +}; + +template +static void BenchmarkMemoize(benchmark::State& state, Memoized&& mem, + const std::vector& keys) { + // Prime memoization cache + for (const auto& key : keys) { + mem(key); + } + + for (auto _ : state) { + int64_t nbytes = 0; + for (const auto& key : keys) { + nbytes += static_cast(mem(key).length()); + } + benchmark::DoNotOptimize(nbytes); + } + state.SetItemsProcessed(state.iterations() * keys.size()); +} + +static void MemoizeLruCached(benchmark::State& state) { + const auto keys = MakeStrings(kCacheSize, state.range(0)); + const auto values = MakeStrings(kCacheSize, state.range(1)); + auto mem = MemoizeLru(Callable(values), kCacheSize); + BenchmarkMemoize(state, mem, keys); +} + +static void MemoizeLruCachedThreadUnsafe(benchmark::State& state) { + const auto keys = MakeStrings(kCacheSize, state.range(0)); + const auto values = MakeStrings(kCacheSize, state.range(1)); + // Emulate recommended usage of MemoizeLruCachedThreadUnsafe + // (the compiler is probably able to cache the TLS-looked up value, though) + thread_local auto mem = MemoizeLruThreadUnsafe(Callable(values), kCacheSize); + BenchmarkMemoize(state, mem, keys); +} + +BENCHMARK(MemoizeLruCached)->Apply(SetCacheArgs); +BENCHMARK(MemoizeLruCachedThreadUnsafe)->Apply(SetCacheArgs); + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/cache_internal.h b/cpp/src/arrow/util/cache_internal.h new file mode 100644 index 
0000000000000..231fd800b6746 --- /dev/null +++ b/cpp/src/arrow/util/cache_internal.h @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/util/functional.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace internal { + +// A LRU (Least recently used) replacement cache +template +class LruCache { + public: + explicit LruCache(int32_t capacity) : capacity_(capacity) { + // The map size can temporarily exceed the cache capacity, see Replace() + map_.reserve(capacity_ + 1); + } + + ARROW_DISALLOW_COPY_AND_ASSIGN(LruCache); + ARROW_DEFAULT_MOVE_AND_ASSIGN(LruCache); + + void Clear() { + items_.clear(); + map_.clear(); + // The C++ spec doesn't tell whether map_.clear() will shrink the map capacity + map_.reserve(capacity_ + 1); + } + + int32_t size() const { + DCHECK_EQ(items_.size(), map_.size()); + return static_cast(items_.size()); + } + + template + Value* Find(K&& key) { + const auto it = map_.find(key); + if (it == map_.end()) { + return nullptr; + } else { + // Found => move item at front of the list + auto list_it = it->second; + 
items_.splice(items_.begin(), items_, list_it); + return &list_it->value; + } + } + + template + std::pair Replace(K&& key, V&& value) { + // Try to insert temporary iterator + auto pair = map_.emplace(std::forward(key), ListIt{}); + const auto it = pair.first; + const bool inserted = pair.second; + if (inserted) { + // Inserted => push item at front of the list, and update iterator + items_.push_front(Item{&it->first, std::forward(value)}); + it->second = items_.begin(); + // Did we exceed the cache capacity? If so, remove least recently used item + if (static_cast(items_.size()) > capacity_) { + const bool erased = map_.erase(*items_.back().key); + DCHECK(erased); + ARROW_UNUSED(erased); + items_.pop_back(); + } + return {true, &it->second->value}; + } else { + // Already exists => move item at front of the list, and update value + auto list_it = it->second; + items_.splice(items_.begin(), items_, list_it); + list_it->value = std::forward(value); + return {false, &list_it->value}; + } + } + + private: + struct Item { + // Pointer to the key inside the unordered_map + const Key* key; + Value value; + }; + using List = std::list; + using ListIt = typename List::iterator; + + const int32_t capacity_; + // In most to least recently used order + std::list items_; + std::unordered_map map_; +}; + +namespace detail { + +template +struct ThreadSafeMemoizer { + using RetType = Value; + + template + ThreadSafeMemoizer(F&& func, int32_t cache_capacity) + : func_(std::forward(func)), cache_(cache_capacity) {} + + // The memoizer can't return a pointer to the cached value, because + // the cache entry may be evicted by another thread. 
+ + Value operator()(const Key& key) { + std::unique_lock lock(mutex_); + const Value* value_ptr; + value_ptr = cache_.Find(key); + if (ARROW_PREDICT_TRUE(value_ptr != nullptr)) { + return *value_ptr; + } + lock.unlock(); + Value v = func_(key); + lock.lock(); + return *cache_.Replace(key, std::move(v)).second; + } + + private: + std::mutex mutex_; + Func func_; + Cache cache_; +}; + +template +struct ThreadUnsafeMemoizer { + using RetType = const Value&; + + template + ThreadUnsafeMemoizer(F&& func, int32_t cache_capacity) + : func_(std::forward(func)), cache_(cache_capacity) {} + + const Value& operator()(const Key& key) { + const Value* value_ptr; + value_ptr = cache_.Find(key); + if (ARROW_PREDICT_TRUE(value_ptr != nullptr)) { + return *value_ptr; + } + return *cache_.Replace(key, func_(key)).second; + } + + private: + Func func_; + Cache cache_; +}; + +template